# flake8: noqa """ Shim module between Bleach and html5lib. This makes it easier to upgrade the html5lib library without having to change a lot of code. """ import re import string import warnings # ignore html5lib deprecation warnings to use bleach; we are bleach # apply before we import submodules that import html5lib warnings.filterwarnings( "ignore", message="html5lib's sanitizer is deprecated", category=DeprecationWarning, module="bleach._vendor.html5lib", ) from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file HTMLParser, getTreeWalker, ) from bleach._vendor.html5lib import ( constants, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file namespaces, prefixes, ) from bleach._vendor.html5lib.constants import ( _ReparseException as ReparseException, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.base import ( Filter, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.sanitizer import ( allowed_protocols, allowed_css_properties, allowed_svg_properties, attr_val_is_uri, svg_attr_val_allows_ref, svg_allow_local_href, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.sanitizer import ( Filter as SanitizerFilter, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib._inputstream import ( HTMLInputStream, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.serializer import ( escape, HTMLSerializer, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib._tokenizer import ( attributeMap, HTMLTokenizer, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib._trie import ( Trie, ) # noqa: E402 module level import not at top of file #: Map of entity name to expanded entity ENTITIES = constants.entities #: Trie of html entity string -> character representation ENTITIES_TRIE = Trie(ENTITIES) #: Token type constants--these never change TAG_TOKEN_TYPES = { constants.tokenTypes["StartTag"], constants.tokenTypes["EndTag"], constants.tokenTypes["EmptyTag"], } TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"] TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"] TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"] TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"] #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17 #: https://html.spec.whatwg.org/multipage/indices.html#elements-3 HTML_TAGS = frozenset( ( "a", "abbr", "address", "area", "article", "aside", "audio", "b", "base", "bdi", "bdo", "blockquote", "body", "br", "button", "canvas", "caption", "cite", "code", "col", "colgroup", "data", "datalist", "dd", "del", "details", "dfn", "dialog", "div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i", "iframe", "img", "input", "ins", "kbd", "keygen", "label", "legend", "li", "link", "map", "mark", "menu", "meta", "meter", "nav", "noscript", "object", "ol", "optgroup", "option", "output", "p", "param", "picture", "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp", "script", "section", "select", "slot", "small", "source", "span", "strong", "style", "sub", "summary", "sup", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "u", "ul", "var", "video", "wbr", ) ) #: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369 #: from mozilla on 2019.07.11 #: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements HTML_TAGS_BLOCK_LEVEL = frozenset( ( "address", "article", "aside", "blockquote", "details", "dialog", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "li", "main", "nav", "ol", "p", "pre", "section", "table", "ul", ) ) class InputStreamWithMemory: """Wraps an HTMLInputStream to remember characters since last < This wraps existing HTMLInputStream classes to keep track of the stream since the last < which marked an open tag state. """ def __init__(self, inner_stream): self._inner_stream = inner_stream self.reset = self._inner_stream.reset self.position = self._inner_stream.position self._buffer = [] @property def errors(self): return self._inner_stream.errors @property def charEncoding(self): return self._inner_stream.charEncoding @property def changeEncoding(self): return self._inner_stream.changeEncoding def char(self): c = self._inner_stream.char() # char() can return None if EOF, so ignore that if c: self._buffer.append(c) return c def charsUntil(self, characters, opposite=False): chars = self._inner_stream.charsUntil(characters, opposite=opposite) self._buffer.extend(list(chars)) return chars def unget(self, char): if self._buffer: self._buffer.pop(-1) return self._inner_stream.unget(char) def get_tag(self): """Returns the stream history since last '<' Since the buffer starts at the last '<' as as seen by tagOpenState(), we know that everything from that point to when this method is called is the "tag" that is being tokenized. """ return "".join(self._buffer) def start_tag(self): """Resets stream history to just '<' This gets called by tagOpenState() which marks a '<' that denotes an open tag. Any time we see that, we reset the buffer. """ self._buffer = ["<"] class BleachHTMLTokenizer(HTMLTokenizer): """Tokenizer that doesn't consume character entities""" def __init__(self, consume_entities=False, **kwargs): super().__init__(**kwargs) self.consume_entities = consume_entities # Wrap the stream with one that remembers the history self.stream = InputStreamWithMemory(self.stream) # Remember the last token emitted; needed for block element spacing self.emitted_last_token = None def __iter__(self): last_error_token = None for token in super().__iter__(): if last_error_token is not None: if ( last_error_token["data"] == "invalid-character-in-attribute-name" and token["type"] in TAG_TOKEN_TYPES and token.get("data") ): # token["data"] is an html5lib attributeMap # (OrderedDict 3.7+ and dict otherwise) # of attr name to attr value # # Remove attribute names that have ', " or < in them # because those characters are invalid for attribute names. token["data"] = attributeMap( (attr_name, attr_value) for attr_name, attr_value in token["data"].items() if ( '"' not in attr_name and "'" not in attr_name and "<" not in attr_name ) ) last_error_token = None yield token elif ( last_error_token["data"] == "expected-closing-tag-but-got-char" and self.parser.tags is not None and token["data"].lower().strip() not in self.parser.tags ): # We've got either a malformed tag or a pseudo-tag or # something that html5lib wants to turn into a malformed # comment which Bleach clean() will drop so we interfere # with the token stream to handle it more correctly. # # If this is an allowed tag, it's malformed and we just let # the html5lib parser deal with it--we don't enter into this # block. # # If this is not an allowed tag, then we convert it to # characters and it'll get escaped in the sanitizer. token["data"] = self.stream.get_tag() token["type"] = TAG_TOKEN_TYPE_CHARACTERS last_error_token = None yield token elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR: # If the token is a parse error, then let the last_error_token # go, and make token the new last_error_token yield last_error_token last_error_token = token else: yield last_error_token yield token last_error_token = None continue # If the token is a ParseError, we hold on to it so we can get the # next token and potentially fix it. if token["type"] == TAG_TOKEN_TYPE_PARSEERROR: last_error_token = token continue yield token if last_error_token: if last_error_token["data"] == "eof-in-tag-name": # Handle the case where the text being parsed ends with < # followed by a series of characters. It's treated as a tag # name that abruptly ends, but we should treat that like # character data yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} elif last_error_token["data"] in ( "eof-in-attribute-name", "eof-in-attribute-value-no-quotes", ): # Handle the case where the text being parsed ends with < # followed by a series of characters and then space and then # more characters. It's treated as a tag name followed by an # attribute that abruptly ends, but we should treat that like # character data. yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} else: yield last_error_token def consumeEntity(self, allowedChar=None, fromAttribute=False): # If this tokenizer is set to consume entities, then we can let the # superclass do its thing. if self.consume_entities: return super().consumeEntity(allowedChar, fromAttribute) # If this tokenizer is set to not consume entities, then we don't want # to consume and convert them, so this overrides the html5lib tokenizer's # consumeEntity so that it's now a no-op. # # However, when that gets called, it's consumed an &, so we put that back in # the stream. if fromAttribute: self.currentToken["data"][-1][1] += "&" else: self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"}) def tagOpenState(self): # This state marks a < that is either a StartTag, EndTag, EmptyTag, # or ParseError. In all cases, we want to drop any stream history # we've collected so far and we do that by calling start_tag() on # the input stream wrapper. self.stream.start_tag() return super().tagOpenState() def emitCurrentToken(self): token = self.currentToken if ( self.parser.tags is not None and token["type"] in TAG_TOKEN_TYPES and token["name"].lower() not in self.parser.tags ): # If this is a start/end/empty tag for a tag that's not in our # allowed list, then it gets stripped or escaped. In both of these # cases it gets converted to a Characters token. if self.parser.strip: if ( self.emitted_last_token and token["type"] == TAG_TOKEN_TYPE_START and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL ): # If this is a block level tag we're stripping, we drop it # for a newline because that's what a browser would parse # it as new_data = "\n" else: # For all other things being stripped, we throw in an empty # string token new_data = "" else: # If we're escaping the token, we want to escape the exact # original string. Since tokenizing also normalizes data # and this is a tag-like thing, we've lost some information. # So we go back through the stream to get the original # string and use that. new_data = self.stream.get_tag() new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data} self.currentToken = self.emitted_last_token = new_token self.tokenQueue.append(new_token) self.state = self.dataState return self.emitted_last_token = self.currentToken super().emitCurrentToken() class BleachHTMLParser(HTMLParser): """Parser that uses BleachHTMLTokenizer""" def __init__(self, tags, strip, consume_entities, **kwargs): """ :arg tags: set of allowed tags--everything else is either stripped or escaped; if None, then this doesn't look at tags at all :arg strip: whether to strip disallowed tags (True) or escape them (False); if tags=None, then this doesn't have any effect :arg consume_entities: whether to consume entities (default behavior) or leave them as is when tokenizing (BleachHTMLTokenizer-added behavior) """ self.tags = ( frozenset((tag.lower() for tag in tags)) if tags is not None else None ) self.strip = strip self.consume_entities = consume_entities super().__init__(**kwargs) def _parse( self, stream, innerHTML=False, container="div", scripting=True, **kwargs ): # set scripting=True to parse