From 41251715adafaa3a6a5b0e624d92fc53fee525fa Mon Sep 17 00:00:00 2001
From: Martin Probst
Date: Sun, 27 Apr 2014 23:40:44 +0200
Subject: [PATCH] Use go.net/html's parser to sanitize HTML.
Use an HTML5-compliant parser, which interprets HTML as a browser would, to
parse the Markdown output, and then sanitize based on the parse result.
Escape unrecognized and disallowed HTML in the result.
Currently works with a hard-coded whitelist of safe HTML tags and attributes.
---
html.go | 71 ---------------------------
inline.go | 1 +
inline_test.go | 104 ++++++++++++++++++++--------------------
markdown.go | 2 +-
sanitize.go | 127 +++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 181 insertions(+), 124 deletions(-)
create mode 100644 sanitize.go
diff --git a/html.go b/html.go
index 54744c2..0c85530 100644
--- a/html.go
+++ b/html.go
@@ -43,52 +43,12 @@ const (
)
var (
- tags = []string{
- "b",
- "blockquote",
- "code",
- "del",
- "dd",
- "dl",
- "dt",
- "em",
- "h1",
- "h2",
- "h3",
- "h4",
- "h5",
- "h6",
- "i",
- "kbd",
- "li",
- "ol",
- "p",
- "pre",
- "s",
- "sup",
- "sub",
- "strong",
- "strike",
- "ul",
- "table",
- "tr",
- "td",
- "th",
- "thead",
- "tbody",
-
- }
-
alignments = []string{
"left",
"right",
"center",
}
- urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
- tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)(\salign="(` + strings.Join(alignments, "|") + `)")?>|<(br|hr)\s?\/?>)$`)
- anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
- imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
// TODO: improve this regexp to catch all possible entities:
htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`)
)
@@ -820,24 +780,6 @@ func findHtmlTagPos(tag []byte, tagname string) (bool, int) {
return false, -1
}
-func sanitizeHtml(html []byte) []byte {
- var result []byte
- for string(html) != "" {
- skip, tag, rest := findHtmlTag(html)
- html = rest
- result = append(result, skip...)
- result = append(result, sanitizeTag(tag)...)
- }
- return append(result, []byte("\n")...)
-}
-
-func sanitizeTag(tag []byte) []byte {
- if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) {
- return tag
- }
- return []byte("")
-}
-
func skipUntilChar(text []byte, start int, char byte) int {
i := start
for i < len(text) && text[i] != char {
@@ -846,19 +788,6 @@ func skipUntilChar(text []byte, start int, char byte) int {
return i
}
-func findHtmlTag(html []byte) (skip, tag, rest []byte) {
- start := skipUntilChar(html, 0, '<')
- rightAngle := skipUntilCharIgnoreQuotes(html, start, '>')
- if rightAngle > start {
- skip = html[0:start]
- tag = html[start : rightAngle+1]
- rest = html[rightAngle+1:]
- return
- }
-
- return []byte(""), []byte(""), []byte("")
-}
-
func skipSpace(tag []byte, i int) int {
for i < len(tag) && isspace(tag[i]) {
i++
diff --git a/inline.go b/inline.go
index 41225ce..b3aaf0f 100644
--- a/inline.go
+++ b/inline.go
@@ -20,6 +20,7 @@ import (
)
var (
+ urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)
)
diff --git a/inline_test.go b/inline_test.go
index 478fbe0..1937021 100644
--- a/inline_test.go
+++ b/inline_test.go
@@ -72,135 +72,135 @@ func doTestsInlineParam(t *testing.T, tests []string, extensions, htmlFlags int)
func TestRawHtmlTag(t *testing.T) {
tests := []string{
"zz \n",
- "zz p {}
\n",
+ "zz <style>p {}</style>
\n",
"zz \n",
- "zz p {}
\n",
+ "zz <style>p {}</style>
\n",
"\n",
- "alert()
\n",
+ "<script>alert()</script>
\n",
"zz \n",
- "zz alert()
\n",
+ "zz <script>alert()</script>
\n",
"zz \n",
- "zz alert()
\n",
+ "zz <script>alert()</script>
\n",
" \n",
- "alert()
\n",
+ "<script>alert()</script>
\n",
"\n",
- "alert()\n",
+ "<script>alert()</script>\n",
"\n",
- "\n",
+ "<script src='foo'></script>\n",
"\n",
- "\n",
+ "<script src='a>b'></script>\n",
"zz \n",
- "zz
\n",
+ "zz <script src='foo'></script>
\n",
"zz \n",
- "zz
\n",
+ "zz <script src=foo></script>
\n",
``,
- "\n",
+ "<script><script src="http://example.com/exploit.js"></script></script>\n",
`'';!--"=&{()}`,
- "'';!--"=&{()}
\n",
+ "'';!--"<xss>=&{()}
\n",
"",
- "\n",
+ "<script SRC=http://ha.ckers.org/xss.js></script>
\n",
"",
- "\n",
+ "<script \nSRC=http://ha.ckers.org/xss.js></script>
\n",
``,
- "\n",
+ "\n",
"",
- "\n",
+ "\n",
"",
- "\n",
+ "\n",
"",
- "\n",
+ "\n",
`xss link`,
- "xss link
\n",
+ "xss link
\n",
"xss link",
- "xss link
\n",
+ "xss link
\n",
- // XXX: this doesn't pass yet
- //`">`,
- //"\n",
+ `">`,
+ "<script>alert("XSS")</script>">
\n",
"",
- "\n",
+ "\n",
``,
- "\n",
+ "\n",
``,
- "\n",
+ "\n",
``,
- "\n",
+ "\n",
"",
- "\n",
+ "\n",
"",
- "\n",
+ "\n",
"",
- "\n",
+ "\n",
``,
- "\n",
+ "\n",
``,
- "\n",
+ "\n",
``,
- "\n",
+ "\n",
``,
- "\n",
+ "\n",
``,
- "\n",
+ "\n",
``,
- "\n",
+ "<script/XSS SRC="http://ha.ckers.org/xss.js"></script>
\n",
- // XXX: this doesn't pass yet
- //"",
- //"\n",
+ "",
+ "<body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")>
\n",
``,
- "\n",
+ "<script/SRC="http://ha.ckers.org/xss.js"></script>
\n",
- // XXX: this doesn't pass yet
- //`<`,
- //"",
+ // HTML5 interprets the `,
+ "<<script>alert("XSS");//<</script>
\n",
+ // HTML5 parses the within an unclosed