mirror of
https://github.com/danog/blackfriday.git
synced 2025-01-22 21:31:20 +01:00
Use go.net/html's parser to sanitize HTML.
Parse the rendered Markdown output with an HTML5-compliant parser that interprets HTML the way a browser would, then sanitize based on the parse result. Unrecognized and disallowed HTML is escaped in the output. Currently this works with a hard-coded whitelist of safe HTML tags and attributes.
This commit is contained in:
parent
3ca168f879
commit
41251715ad
71
html.go
71
html.go
@ -43,52 +43,12 @@ const (
|
||||
)
|
||||
|
||||
// Inputs to the regexp-based tag whitelist.
var (
	// tags are the element names accepted as plain opening or closing tags.
	tags = []string{
		"b", "blockquote", "code", "del", "dd", "dl", "dt", "em",
		"h1", "h2", "h3", "h4", "h5", "h6",
		"i", "kbd", "li", "ol", "p", "pre",
		"s", "sup", "sub", "strong", "strike", "ul",
		"table", "tr", "td", "th", "thead", "tbody",
	}

	// alignments are the only values accepted for an align="..." attribute.
	alignments = []string{"left", "right", "center"}

	// urlRe is a pattern fragment for an http(s)/ftp URL or a path that
	// starts with a slash.
	urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
)

// Compiled whitelists; each one must match a complete tag from '<' to '>'.
var (
	// tagWhitelist accepts an opening/closing tag from tags, optionally with
	// one align attribute, or a <br>/<hr> in either HTML or XHTML spelling.
	tagWhitelist = regexp.MustCompile(
		`^(<\/?(` + strings.Join(tags, "|") + `)` +
			`(\salign="(` + strings.Join(alignments, "|") + `)")?>` +
			`|<(br|hr)\s?\/?>)$`)

	// anchorClean accepts <a href="URL"> with an optional non-empty title,
	// or a closing </a>.
	anchorClean = regexp.MustCompile(
		`^(<a\shref="` + urlRe + `"` +
			`(\stitle="[^"<>]+")?\s?>` +
			`|<\/a>)$`)

	// imgClean accepts <img src="URL"> followed by optional width, height,
	// alt and title attributes, in exactly that order.
	imgClean = regexp.MustCompile(
		`^(<img\ssrc="` + urlRe + `"` +
			`(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?` +
			`(\salt="[^"<>]*")?(\stitle="[^"<>]*")?` +
			`\s?\/?>)$`)

	// TODO: improve this regexp to catch all possible entities:
	htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`)
)
|
||||
@ -820,24 +780,6 @@ func findHtmlTagPos(tag []byte, tagname string) (bool, int) {
|
||||
return false, -1
|
||||
}
|
||||
|
||||
func sanitizeHtml(html []byte) []byte {
|
||||
var result []byte
|
||||
for string(html) != "" {
|
||||
skip, tag, rest := findHtmlTag(html)
|
||||
html = rest
|
||||
result = append(result, skip...)
|
||||
result = append(result, sanitizeTag(tag)...)
|
||||
}
|
||||
return append(result, []byte("\n")...)
|
||||
}
|
||||
|
||||
func sanitizeTag(tag []byte) []byte {
|
||||
if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) {
|
||||
return tag
|
||||
}
|
||||
return []byte("")
|
||||
}
|
||||
|
||||
func skipUntilChar(text []byte, start int, char byte) int {
|
||||
i := start
|
||||
for i < len(text) && text[i] != char {
|
||||
@ -846,19 +788,6 @@ func skipUntilChar(text []byte, start int, char byte) int {
|
||||
return i
|
||||
}
|
||||
|
||||
func findHtmlTag(html []byte) (skip, tag, rest []byte) {
|
||||
start := skipUntilChar(html, 0, '<')
|
||||
rightAngle := skipUntilCharIgnoreQuotes(html, start, '>')
|
||||
if rightAngle > start {
|
||||
skip = html[0:start]
|
||||
tag = html[start : rightAngle+1]
|
||||
rest = html[rightAngle+1:]
|
||||
return
|
||||
}
|
||||
|
||||
return []byte(""), []byte(""), []byte("")
|
||||
}
|
||||
|
||||
func skipSpace(tag []byte, i int) int {
|
||||
for i < len(tag) && isspace(tag[i]) {
|
||||
i++
|
||||
|
@ -20,6 +20,7 @@ import (
|
||||
)
|
||||
|
||||
// urlRe is a pattern fragment for an http(s)/ftp URL or a path that starts
// with a slash.
var urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`

// anchorRe matches, at the start of the input, a complete anchor element whose
// href and link text are both URLs (as defined by urlRe), with an optional
// non-empty title attribute.
var anchorRe = regexp.MustCompile(
	`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` +
		urlRe +
		`<\/a>)`)
|
||||
|
||||
|
104
inline_test.go
104
inline_test.go
@ -72,135 +72,135 @@ func doTestsInlineParam(t *testing.T, tests []string, extensions, htmlFlags int)
|
||||
func TestRawHtmlTag(t *testing.T) {
|
||||
tests := []string{
|
||||
"zz <style>p {}</style>\n",
|
||||
"<p>zz p {}</p>\n",
|
||||
"<p>zz <style>p {}</style></p>\n",
|
||||
|
||||
"zz <STYLE>p {}</STYLE>\n",
|
||||
"<p>zz p {}</p>\n",
|
||||
"<p>zz <style>p {}</style></p>\n",
|
||||
|
||||
"<SCRIPT>alert()</SCRIPT>\n",
|
||||
"<p>alert()</p>\n",
|
||||
"<p><script>alert()</script></p>\n",
|
||||
|
||||
"zz <SCRIPT>alert()</SCRIPT>\n",
|
||||
"<p>zz alert()</p>\n",
|
||||
"<p>zz <script>alert()</script></p>\n",
|
||||
|
||||
"zz <script>alert()</script>\n",
|
||||
"<p>zz alert()</p>\n",
|
||||
"<p>zz <script>alert()</script></p>\n",
|
||||
|
||||
" <script>alert()</script>\n",
|
||||
"<p>alert()</p>\n",
|
||||
"<p><script>alert()</script></p>\n",
|
||||
|
||||
"<script>alert()</script>\n",
|
||||
"alert()\n",
|
||||
"<script>alert()</script>\n",
|
||||
|
||||
"<script src='foo'></script>\n",
|
||||
"\n",
|
||||
"<script src='foo'></script>\n",
|
||||
|
||||
"<script src='a>b'></script>\n",
|
||||
"\n",
|
||||
"<script src='a>b'></script>\n",
|
||||
|
||||
"zz <script src='foo'></script>\n",
|
||||
"<p>zz </p>\n",
|
||||
"<p>zz <script src='foo'></script></p>\n",
|
||||
|
||||
"zz <script src=foo></script>\n",
|
||||
"<p>zz </p>\n",
|
||||
"<p>zz <script src=foo></script></p>\n",
|
||||
|
||||
`<script><script src="http://example.com/exploit.js"></SCRIPT></script>`,
|
||||
"\n",
|
||||
"<script><script src="http://example.com/exploit.js"></script></script>\n",
|
||||
|
||||
`'';!--"<XSS>=&{()}`,
|
||||
"<p>'';!--"=&{()}</p>\n",
|
||||
"<p>'';!--"<xss>=&{()}</p>\n",
|
||||
|
||||
"<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>",
|
||||
"<p></p>\n",
|
||||
"<p><script SRC=http://ha.ckers.org/xss.js></script></p>\n",
|
||||
|
||||
"<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>",
|
||||
"<p></p>\n",
|
||||
"<p><script \nSRC=http://ha.ckers.org/xss.js></script></p>\n",
|
||||
|
||||
`<IMG SRC="javascript:alert('XSS');">`,
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
"<IMG SRC=javascript:alert('XSS')>",
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
"<IMG SRC=JaVaScRiPt:alert('XSS')>",
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
"<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>",
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<a onmouseover="alert(document.cookie)">xss link</a>`,
|
||||
"<p>xss link</a></p>\n",
|
||||
"<p><a>xss link</a></p>\n",
|
||||
|
||||
"<a onmouseover=alert(document.cookie)>xss link</a>",
|
||||
"<p>xss link</a></p>\n",
|
||||
"<p><a>xss link</a></p>\n",
|
||||
|
||||
// XXX: this doesn't pass yet
|
||||
//`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
|
||||
//"<p></p>\n",
|
||||
`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
|
||||
"<p><img><script>alert(&quot;XSS&quot;)</script>"></p>\n",
|
||||
|
||||
"<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>",
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<IMG SRC=# onmouseover="alert('xxs')">`,
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<IMG SRC= onmouseover="alert('xxs')">`,
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<IMG onmouseover="alert('xxs')">`,
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
"<IMG SRC=javascript:alert('XSS')>",
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
"<IMG SRC=javascript:alert('XSS')>",
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
"<IMG SRC=javascript:alert('XSS')>",
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<IMG SRC="javascriptascript:alert('XSS');">`,
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<IMG SRC="jav	ascript:alert('XSS');">`,
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<IMG SRC="jav
ascript:alert('XSS');">`,
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<IMG SRC="jav
ascript:alert('XSS');">`,
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<IMG SRC="  javascript:alert('XSS');">`,
|
||||
"<p></p>\n",
|
||||
"<p><img></p>\n",
|
||||
|
||||
`<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
|
||||
"<p></p>\n",
|
||||
"<p><script/XSS SRC="http://ha.ckers.org/xss.js"></script></p>\n",
|
||||
|
||||
// XXX: this doesn't pass yet
|
||||
//"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>",
|
||||
//"\n",
|
||||
"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>",
|
||||
"<p><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></p>\n",
|
||||
|
||||
`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
|
||||
"<p></p>\n",
|
||||
"<p><script/SRC="http://ha.ckers.org/xss.js"></script></p>\n",
|
||||
|
||||
// XXX: this doesn't pass yet
|
||||
//`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
|
||||
//"",
|
||||
// HTML5 interprets the <script> tag contents as raw text, thus the end
|
||||
// result has double-escaped &quot;
|
||||
`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
|
||||
"<p><<script>alert(&quot;XSS&quot;);//&lt;</script></p>\n",
|
||||
|
||||
// HTML5 parses the </p> within an unclosed <script> tag as text.
|
||||
// Same for the following tests.
|
||||
"<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >",
|
||||
"<p></p>\n",
|
||||
"<p><script SRC=http://ha.ckers.org/xss.js?< B ></p>\n",
|
||||
|
||||
"<SCRIPT SRC=//ha.ckers.org/.j>",
|
||||
"<p></p>\n",
|
||||
"<p><script SRC=//ha.ckers.org/.j></p>\n",
|
||||
|
||||
// XXX: this doesn't pass yet
|
||||
//`<IMG SRC="javascript:alert('XSS')"`,
|
||||
//"",
|
||||
`<IMG SRC="javascript:alert('XSS')"`,
|
||||
"<p><IMG SRC="javascript:alert('XSS')"</p>\n",
|
||||
|
||||
// XXX: this doesn't pass yet
|
||||
//"<iframe src=http://ha.ckers.org/scriptlet.html <",
|
||||
//"",
|
||||
"<iframe src=http://ha.ckers.org/scriptlet.html <",
|
||||
// The hyperlink gets linkified, the <iframe> gets escaped
|
||||
"<p><iframe src=<a href=\"http://ha.ckers.org/scriptlet.html\">http://ha.ckers.org/scriptlet.html</a> <</p>\n",
|
||||
}
|
||||
doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT)
|
||||
}
|
||||
|
@ -298,7 +298,7 @@ func Markdown(input []byte, renderer Renderer, extensions int) []byte {
|
||||
second := secondPass(p, first)
|
||||
|
||||
if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 {
|
||||
second = sanitizeHtml(second)
|
||||
second = sanitizeHtmlSafe(second)
|
||||
}
|
||||
|
||||
return second
|
||||
|
127
sanitize.go
Normal file
127
sanitize.go
Normal file
@ -0,0 +1,127 @@
|
||||
package blackfriday
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"code.google.com/p/go.net/html"
|
||||
"fmt"
|
||||
"io"
|
||||
)
|
||||
|
||||
// Whitelisted element tags, attributes on particular tags, attributes that are
|
||||
// interpreted as protocols (again on particular tags), and allowed protocols.
|
||||
var (
|
||||
whitelistTags map[string]bool
|
||||
whitelistAttrs map[string]map[string]bool
|
||||
protocolAttrs map[string]map[string]bool
|
||||
whitelistProtocols [][]byte
|
||||
)
|
||||
|
||||
func init() {
|
||||
whitelistTags = toSet([]string{
|
||||
"a", "b", "blockquote", "br", "caption", "cite", "code", "col",
|
||||
"colgroup", "dd", "div", "dl", "dt", "em",
|
||||
"h1", "h2", "h3", "h4", "h5", "h6",
|
||||
"i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong",
|
||||
"sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u",
|
||||
"ul"})
|
||||
whitelistAttrs = map[string]map[string]bool{
|
||||
"a": toSet([]string{"href", "title"}),
|
||||
"img": toSet([]string{"src", "alt", "title"}),
|
||||
}
|
||||
protocolAttrs = map[string]map[string]bool{
|
||||
"a": toSet([]string{"href"}),
|
||||
"img": toSet([]string{"src"}),
|
||||
}
|
||||
whitelistProtocols = [][]byte{
|
||||
[]byte("http://"),
|
||||
[]byte("https://"),
|
||||
[]byte("ftp://"),
|
||||
[]byte("mailto:"),
|
||||
}
|
||||
}
|
||||
|
||||
// toSet builds a membership set from keys: every element of keys maps to
// true, and looking up any other string yields false. Duplicate keys collapse
// into one entry. The returned map is never nil.
func toSet(keys []string) map[string]bool {
	set := make(map[string]bool, len(keys))
	for i := range keys {
		set[keys[i]] = true
	}
	return set
}
|
||||
|
||||
// Sanitizes the given input by parsing it as HTML5, then whitelisting known to
|
||||
// be safe elements and attributes. All other HTML is escaped, unsafe attributes
|
||||
// are stripped.
|
||||
func sanitizeHtmlSafe(input []byte) []byte {
|
||||
r := bytes.NewReader(input)
|
||||
var w bytes.Buffer
|
||||
tokenizer := html.NewTokenizer(r)
|
||||
wr := bufio.NewWriter(&w)
|
||||
|
||||
// Iterate through all tokens in the input stream and sanitize them.
|
||||
for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
|
||||
switch t {
|
||||
case html.TextToken:
|
||||
// Text is written escaped.
|
||||
wr.WriteString(tokenizer.Token().String())
|
||||
case html.StartTagToken:
|
||||
// HTML tags are escaped unless whitelisted.
|
||||
tag, hasAttributes := tokenizer.TagName()
|
||||
tagName := string(tag)
|
||||
if whitelistTags[tagName] {
|
||||
wr.WriteString("<")
|
||||
wr.Write(tag)
|
||||
for hasAttributes {
|
||||
var key, val []byte
|
||||
key, val, hasAttributes = tokenizer.TagAttr()
|
||||
attrName := string(key)
|
||||
// Only include whitelisted attributes for the given tagName.
|
||||
tagWhitelistedAttrs, ok := whitelistAttrs[tagName]
|
||||
if ok && tagWhitelistedAttrs[attrName] {
|
||||
// For whitelisted attributes, if it's an attribute that requires
|
||||
// protocol checking, do so and strip it if it's not known to be safe.
|
||||
tagProtocolAttrs, ok := protocolAttrs[tagName]
|
||||
if ok && tagProtocolAttrs[attrName] {
|
||||
if !protocolAllowed(val) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
wr.WriteByte(' ')
|
||||
wr.Write(key)
|
||||
wr.WriteString(`="`)
|
||||
wr.WriteString(html.EscapeString(string(val)))
|
||||
wr.WriteByte('"')
|
||||
}
|
||||
}
|
||||
wr.WriteString(">")
|
||||
} else {
|
||||
wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
|
||||
}
|
||||
case html.EndTagToken:
|
||||
// Whitelisted tokens can be written in raw.
|
||||
tag, _ := tokenizer.TagName()
|
||||
if whitelistTags[string(tag)] {
|
||||
wr.Write(tokenizer.Raw())
|
||||
} else {
|
||||
wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Errorf("Unexpected token type %v", t))
|
||||
}
|
||||
}
|
||||
err := tokenizer.Err()
|
||||
if err != nil && err != io.EOF {
|
||||
panic(tokenizer.Err())
|
||||
}
|
||||
wr.Flush()
|
||||
return w.Bytes()
|
||||
}
|
||||
|
||||
func protocolAllowed(attr []byte) bool {
|
||||
for _, prefix := range whitelistProtocols {
|
||||
if bytes.HasPrefix(attr, prefix) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user