1
0
mirror of https://github.com/danog/blackfriday.git synced 2025-01-22 21:31:20 +01:00

Use go.net/html's parser to sanitize HTML.

Use an HTML5-compliant parser, which interprets HTML as a browser would, to
parse the Markdown output, and then sanitize based on the parse result.
Escape unrecognized and disallowed HTML in the result.
Currently works with a hard-coded whitelist of safe HTML tags and attributes.
This commit is contained in:
Martin Probst 2014-04-27 23:40:44 +02:00
parent 3ca168f879
commit 41251715ad
5 changed files with 181 additions and 124 deletions

71
html.go
View File

@ -43,52 +43,12 @@ const (
)
// Whitelist data and precompiled patterns used by the regexp-based tag
// sanitizer (see sanitizeTag).
var (
// tags lists the element names that tagWhitelist accepts as plain opening
// or closing tags.
tags = []string{
"b",
"blockquote",
"code",
"del",
"dd",
"dl",
"dt",
"em",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"i",
"kbd",
"li",
"ol",
"p",
"pre",
"s",
"sup",
"sub",
"strong",
"strike",
"ul",
"table",
"tr",
"td",
"th",
"thead",
"tbody",
}
// alignments lists the values tagWhitelist accepts for the optional align
// attribute.
alignments = []string{
"left",
"right",
"center",
}
// urlRe is a pattern fragment (not compiled on its own) matching either an
// http://, https:// or ftp:// prefix or a leading "/", followed by one or
// more characters from a fixed set of URL characters.
urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
// tagWhitelist matches an entire tag that is either an opening/closing tag
// named in tags, optionally carrying a single align attribute with a value
// from alignments, or a <br>/<hr> tag with optional space and slash.
tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)(\salign="(` + strings.Join(alignments, "|") + `)")?>|<(br|hr)\s?\/?>)$`)
// anchorClean matches an entire <a href="URL"> tag with an optional
// non-empty title attribute, or a bare </a>.
anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
// imgClean matches an entire <img src="URL"> tag whose optional width,
// height, alt and title attributes appear in exactly that order.
imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
// htmlEntity matches named entities made of 2 to 5 lowercase letters.
// TODO: improve this regexp to catch all possible entities:
htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`)
)
@ -820,24 +780,6 @@ func findHtmlTagPos(tag []byte, tagname string) (bool, int) {
return false, -1
}
// sanitizeHtml walks html one tag at a time using findHtmlTag. The text that
// precedes each tag is copied through unchanged, and the tag itself is kept
// only if sanitizeTag accepts it. A newline is appended to the result.
func sanitizeHtml(html []byte) []byte {
	var out []byte
	remaining := html
	for len(remaining) > 0 {
		text, tag, tail := findHtmlTag(remaining)
		out = append(out, text...)
		out = append(out, sanitizeTag(tag)...)
		remaining = tail
	}
	out = append(out, "\n"...)
	return out
}
// sanitizeTag returns tag unchanged when it matches one of the whitelist
// patterns (tagWhitelist, anchorClean or imgClean) and an empty slice
// otherwise.
func sanitizeTag(tag []byte) []byte {
	switch {
	case tagWhitelist.Match(tag), anchorClean.Match(tag), imgClean.Match(tag):
		return tag
	}
	return []byte("")
}
func skipUntilChar(text []byte, start int, char byte) int {
i := start
for i < len(text) && text[i] != char {
@ -846,19 +788,6 @@ func skipUntilChar(text []byte, start int, char byte) int {
return i
}
// findHtmlTag splits html around the first tag it can locate.
//
// skip is everything before the first '<', tag runs from that '<' through the
// position reported by skipUntilCharIgnoreQuotes for '>' (inclusive), and rest
// is whatever follows. When that position is not past the '<', three empty
// slices are returned instead, so any remaining input is discarded.
func findHtmlTag(html []byte) (skip, tag, rest []byte) {
	open := skipUntilChar(html, 0, '<')
	closeAt := skipUntilCharIgnoreQuotes(html, open, '>')
	if closeAt <= open {
		return []byte(""), []byte(""), []byte("")
	}
	end := closeAt + 1
	return html[0:open], html[open:end], html[end:]
}
func skipSpace(tag []byte, i int) int {
for i < len(tag) && isspace(tag[i]) {
i++

View File

@ -20,6 +20,7 @@ import (
)
var (
// urlRe is a pattern fragment matching either an http://, https:// or
// ftp:// prefix or a leading "/", followed by one or more characters from a
// fixed set of URL characters.
urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
// anchorRe matches, anchored at the start of the input only, a complete
// <a href="URL">URL</a> element (with an optional non-empty title
// attribute) whose link text is itself a URL.
anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)
)

View File

@ -72,135 +72,135 @@ func doTestsInlineParam(t *testing.T, tests []string, extensions, htmlFlags int)
func TestRawHtmlTag(t *testing.T) {
tests := []string{
"zz <style>p {}</style>\n",
"<p>zz p {}</p>\n",
"<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n",
"zz <STYLE>p {}</STYLE>\n",
"<p>zz p {}</p>\n",
"<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n",
"<SCRIPT>alert()</SCRIPT>\n",
"<p>alert()</p>\n",
"<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n",
"zz <SCRIPT>alert()</SCRIPT>\n",
"<p>zz alert()</p>\n",
"<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n",
"zz <script>alert()</script>\n",
"<p>zz alert()</p>\n",
"<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n",
" <script>alert()</script>\n",
"<p>alert()</p>\n",
"<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n",
"<script>alert()</script>\n",
"alert()\n",
"&lt;script&gt;alert()&lt;/script&gt;\n",
"<script src='foo'></script>\n",
"\n",
"&lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;\n",
"<script src='a>b'></script>\n",
"\n",
"&lt;script src=&#39;a&gt;b&#39;&gt;&lt;/script&gt;\n",
"zz <script src='foo'></script>\n",
"<p>zz </p>\n",
"<p>zz &lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;</p>\n",
"zz <script src=foo></script>\n",
"<p>zz </p>\n",
"<p>zz &lt;script src=foo&gt;&lt;/script&gt;</p>\n",
`<script><script src="http://example.com/exploit.js"></SCRIPT></script>`,
"\n",
"&lt;script&gt;&lt;script src=&#34;http://example.com/exploit.js&#34;&gt;&lt;/script&gt;&lt;/script&gt;\n",
`'';!--"<XSS>=&{()}`,
"<p>'';!--&quot;=&amp;{()}</p>\n",
"<p>&#39;&#39;;!--&#34;&lt;xss&gt;=&amp;{()}</p>\n",
"<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>",
"<p></p>\n",
"<p>&lt;script SRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n",
"<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>",
"<p></p>\n",
"<p>&lt;script \nSRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n",
`<IMG SRC="javascript:alert('XSS');">`,
"<p></p>\n",
"<p><img></p>\n",
"<IMG SRC=javascript:alert('XSS')>",
"<p></p>\n",
"<p><img></p>\n",
"<IMG SRC=JaVaScRiPt:alert('XSS')>",
"<p></p>\n",
"<p><img></p>\n",
"<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>",
"<p></p>\n",
"<p><img></p>\n",
`<a onmouseover="alert(document.cookie)">xss link</a>`,
"<p>xss link</a></p>\n",
"<p><a>xss link</a></p>\n",
"<a onmouseover=alert(document.cookie)>xss link</a>",
"<p>xss link</a></p>\n",
"<p><a>xss link</a></p>\n",
// XXX: this doesn't pass yet
//`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
//"<p></p>\n",
`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
"<p><img>&lt;script&gt;alert(&amp;quot;XSS&amp;quot;)&lt;/script&gt;&#34;&gt;</p>\n",
"<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>",
"<p></p>\n",
"<p><img></p>\n",
`<IMG SRC=# onmouseover="alert('xxs')">`,
"<p></p>\n",
"<p><img></p>\n",
`<IMG SRC= onmouseover="alert('xxs')">`,
"<p></p>\n",
"<p><img></p>\n",
`<IMG onmouseover="alert('xxs')">`,
"<p></p>\n",
"<p><img></p>\n",
"<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>",
"<p></p>\n",
"<p><img></p>\n",
"<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>",
"<p></p>\n",
"<p><img></p>\n",
"<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>",
"<p></p>\n",
"<p><img></p>\n",
`<IMG SRC="javascriptascript:alert('XSS');">`,
"<p></p>\n",
"<p><img></p>\n",
`<IMG SRC="jav&#x09;ascript:alert('XSS');">`,
"<p></p>\n",
"<p><img></p>\n",
`<IMG SRC="jav&#x0A;ascript:alert('XSS');">`,
"<p></p>\n",
"<p><img></p>\n",
`<IMG SRC="jav&#x0D;ascript:alert('XSS');">`,
"<p></p>\n",
"<p><img></p>\n",
`<IMG SRC=" &#14; javascript:alert('XSS');">`,
"<p></p>\n",
"<p><img></p>\n",
`<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
"<p></p>\n",
"<p>&lt;script/XSS SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",
// XXX: this doesn't pass yet
//"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>",
//"\n",
"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>",
"<p>&lt;body onload!#$%&amp;()*~+-_.,:;?@[/|\\]^`=alert(&#34;XSS&#34;)&gt;</p>\n",
`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
"<p></p>\n",
"<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",
// XXX: this doesn't pass yet
//`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
//"",
// HTML5 interprets the <script> tag contents as raw text, thus the end
// result has double-escaped &amp;quot;
`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
"<p>&lt;&lt;script&gt;alert(&amp;quot;XSS&amp;quot;);//&amp;lt;&lt;/script&gt;</p>\n",
// HTML5 parses the </p> within an unclosed <script> tag as text.
// Same for the following tests.
"<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >",
"<p></p>\n",
"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;&lt;/p&gt;\n",
"<SCRIPT SRC=//ha.ckers.org/.j>",
"<p></p>\n",
"<p>&lt;script SRC=//ha.ckers.org/.j&gt;&lt;/p&gt;\n",
// XXX: this doesn't pass yet
//`<IMG SRC="javascript:alert('XSS')"`,
//"",
`<IMG SRC="javascript:alert('XSS')"`,
"<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n",
// XXX: this doesn't pass yet
//"<iframe src=http://ha.ckers.org/scriptlet.html <",
//"",
"<iframe src=http://ha.ckers.org/scriptlet.html <",
// The hyperlink gets linkified, the <iframe> gets escaped
"<p>&lt;iframe src=<a href=\"http://ha.ckers.org/scriptlet.html\">http://ha.ckers.org/scriptlet.html</a> &lt;</p>\n",
}
doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT)
}

View File

@ -298,7 +298,7 @@ func Markdown(input []byte, renderer Renderer, extensions int) []byte {
second := secondPass(p, first)
if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 {
second = sanitizeHtml(second)
second = sanitizeHtmlSafe(second)
}
return second

127
sanitize.go Normal file
View File

@ -0,0 +1,127 @@
package blackfriday
import (
"bufio"
"bytes"
"code.google.com/p/go.net/html"
"fmt"
"io"
)
// Whitelisted element tags, attributes on particular tags, attributes that are
// interpreted as protocols (again on particular tags), and allowed protocols.
// All four are populated in init.
var (
// whitelistTags is the set of element names that are emitted as real tags;
// tags not in this set are HTML-escaped.
whitelistTags map[string]bool
// whitelistAttrs maps an element name to the set of attribute names kept on
// that element. Elements without an entry keep no attributes.
whitelistAttrs map[string]map[string]bool
// protocolAttrs maps an element name to the attribute names whose value
// must pass protocolAllowed before the attribute is kept.
protocolAttrs map[string]map[string]bool
// whitelistProtocols lists the value prefixes accepted by protocolAllowed.
whitelistProtocols [][]byte
)
func init() {
whitelistTags = toSet([]string{
"a", "b", "blockquote", "br", "caption", "cite", "code", "col",
"colgroup", "dd", "div", "dl", "dt", "em",
"h1", "h2", "h3", "h4", "h5", "h6",
"i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong",
"sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u",
"ul"})
whitelistAttrs = map[string]map[string]bool{
"a": toSet([]string{"href", "title"}),
"img": toSet([]string{"src", "alt", "title"}),
}
protocolAttrs = map[string]map[string]bool{
"a": toSet([]string{"href"}),
"img": toSet([]string{"src"}),
}
whitelistProtocols = [][]byte{
[]byte("http://"),
[]byte("https://"),
[]byte("ftp://"),
[]byte("mailto:"),
}
}
// toSet builds a membership set from keys: every element of keys maps to
// true in the returned map. The result is never nil, even for empty input.
func toSet(keys []string) map[string]bool {
	set := make(map[string]bool, len(keys))
	for i := 0; i < len(keys); i++ {
		set[keys[i]] = true
	}
	return set
}
// sanitizeHtmlSafe sanitizes input by tokenizing it as HTML5 and rebuilding
// the output from the token stream:
//
//   - text is written HTML-escaped;
//   - start and self-closing tags named in whitelistTags are re-emitted with
//     only the attributes listed for that tag in whitelistAttrs; attributes
//     listed in protocolAttrs are additionally dropped unless their value
//     passes protocolAllowed;
//   - end tags named in whitelistTags are re-emitted as "</name>";
//   - every other tag, as well as comments and doctypes, is HTML-escaped.
//
// It panics if the tokenizer stops with an error other than io.EOF.
func sanitizeHtmlSafe(input []byte) []byte {
	var w bytes.Buffer
	// Writes go to an in-memory buffer, so their errors are not checked.
	wr := bufio.NewWriter(&w)
	tokenizer := html.NewTokenizer(bytes.NewReader(input))

	// Iterate through all tokens in the input stream and sanitize them.
	for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
		switch t {
		case html.TextToken:
			// Text is written escaped.
			wr.WriteString(tokenizer.Token().String())

		case html.StartTagToken, html.SelfClosingTagToken:
			// HTML tags are escaped unless whitelisted. Self-closing tags
			// (e.g. "<br />") are handled like start tags; previously they
			// fell through to the panic below.
			tag, hasAttributes := tokenizer.TagName()
			tagName := string(tag)
			if whitelistTags[tagName] {
				// Looking up a missing key yields a nil map, which reads as
				// "no attribute allowed".
				allowedAttrs := whitelistAttrs[tagName]
				checkedAttrs := protocolAttrs[tagName]

				wr.WriteString("<")
				wr.Write(tag)
				for hasAttributes {
					var key, val []byte
					key, val, hasAttributes = tokenizer.TagAttr()
					attrName := string(key)
					// Only include whitelisted attributes for the given tagName.
					if !allowedAttrs[attrName] {
						continue
					}
					// Attributes that carry a URL are stripped unless the
					// protocol is known to be safe.
					if checkedAttrs[attrName] && !protocolAllowed(val) {
						continue
					}
					wr.WriteByte(' ')
					wr.Write(key)
					wr.WriteString(`="`)
					wr.WriteString(html.EscapeString(string(val)))
					wr.WriteByte('"')
				}
				if t == html.SelfClosingTagToken {
					wr.WriteString("/>")
				} else {
					wr.WriteString(">")
				}
			} else {
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}

		case html.EndTagToken:
			// Whitelisted end tags are rebuilt from the tag name so that
			// nothing else from the raw token reaches the output.
			tag, _ := tokenizer.TagName()
			if whitelistTags[string(tag)] {
				wr.WriteString("</")
				wr.Write(tag)
				wr.WriteString(">")
			} else {
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}

		case html.CommentToken, html.DoctypeToken:
			// Neither is whitelisted markup, so both are escaped like any
			// other disallowed HTML instead of aborting the whole render.
			// Comments are escaped rather than passed through because some
			// legacy browsers interpret markup inside conditional comments.
			wr.WriteString(html.EscapeString(string(tokenizer.Raw())))

		default:
			// All token types produced by the tokenizer are handled above;
			// reaching this point indicates a programming error.
			panic(fmt.Errorf("Unexpected token type %v", t))
		}
	}

	// io.EOF is the normal end of input; anything else is unexpected when
	// reading from memory.
	if err := tokenizer.Err(); err != nil && err != io.EOF {
		panic(err)
	}
	wr.Flush()
	return w.Bytes()
}
// protocolAllowed reports whether attr begins with one of the prefixes in
// whitelistProtocols. The comparison ignores ASCII letter case because URL
// schemes are case-insensitive, so "HTTP://example.com" is accepted just like
// "http://example.com".
func protocolAllowed(attr []byte) bool {
	for _, prefix := range whitelistProtocols {
		if hasPrefixFoldASCII(attr, prefix) {
			return true
		}
	}
	return false
}

// hasPrefixFoldASCII reports whether s starts with prefix, treating the ASCII
// letters A-Z and a-z as equal. Only ASCII is folded, so non-ASCII look-alike
// characters never match a whitelisted prefix.
func hasPrefixFoldASCII(s, prefix []byte) bool {
	if len(s) < len(prefix) {
		return false
	}
	for i, want := range prefix {
		got := s[i]
		if 'A' <= got && got <= 'Z' {
			got += 'a' - 'A'
		}
		if 'A' <= want && want <= 'Z' {
			want += 'a' - 'A'
		}
		if got != want {
			return false
		}
	}
	return true
}