diff --git a/.appveyor.yml b/.appveyor.yml
deleted file mode 100644
index e6f7bf48..00000000
--- a/.appveyor.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-# appveyor.yml - https://www.appveyor.com/docs/lang/python
-# https://www.appveyor.com/docs/windows-images-software/#visual-studio-2022
----
-image: Visual Studio 2022
-environment:
- matrix:
- - PY_PYTHON: 2.7
- TOXENV: py27-base
- - PY_PYTHON: 2.7
- TOXENV: py27-optional
- - PY_PYTHON: 3.7
- TOXENV: py37-base
- - PY_PYTHON: 3.7
- TOXENV: py37-optional
-
-install:
- - git submodule update --init --recursive
- - py --list
- - py -VV
- - py -m pip install --upgrade pip
- - py -m pip install tox
-
-build: off
-
-test_script:
- - py -m tox
-
-after_test:
- - py debug-info.py
diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml
index 5ed83175..0912abb3 100644
--- a/.github/workflows/python-tox.yml
+++ b/.github/workflows/python-tox.yml
@@ -12,9 +12,6 @@ jobs:
os: [ubuntu-latest, windows-latest]
deps: [base, optional]
include:
- - python: "pypy-2.7"
- os: ubuntu-latest
- deps: base
- python: "pypy-3.10"
os: ubuntu-latest
deps: base
diff --git a/README.rst b/README.rst
index 6a623a43..befc7aaa 100644
--- a/README.rst
+++ b/README.rst
@@ -29,7 +29,7 @@ or:
By default, the ``document`` will be an ``xml.etree`` element instance.
Whenever possible, html5lib chooses the accelerated ``ElementTree``
-implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
+implementation.
Two other tree types are supported: ``xml.dom.minidom`` and
``lxml.etree``. To use an alternative format, specify the name of
@@ -41,18 +41,6 @@ a treebuilder:
with open("mydocument.html", "rb") as f:
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
-When using with ``urllib2`` (Python 2), the charset from HTTP should be
-pass into html5lib as follows:
-
-.. code-block:: python
-
- from contextlib import closing
- from urllib2 import urlopen
- import html5lib
-
- with closing(urlopen("http://example.com/")) as f:
- document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
-
When using with ``urllib.request`` (Python 3), the charset from HTTP
should be pass into html5lib as follows:
@@ -90,7 +78,7 @@ More documentation is available at https://html5lib.readthedocs.io/.
Installation
------------
-html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
+html5lib works on CPython 3.8+ and PyPy. To install:
.. code-block:: bash
diff --git a/debug-info.py b/debug-info.py
index b47b8ebf..5523067c 100644
--- a/debug-info.py
+++ b/debug-info.py
@@ -1,4 +1,3 @@
-from __future__ import print_function, unicode_literals
import platform
import sys
@@ -12,7 +11,7 @@
"maxsize": sys.maxsize
}
-search_modules = ["chardet", "genshi", "html5lib", "lxml", "six"]
+search_modules = ["chardet", "genshi", "html5lib", "lxml"]
found_modules = []
for m in search_modules:
diff --git a/doc/conf.py b/doc/conf.py
index d5a1e863..66defcce 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
#
# html5lib documentation build configuration file, created by
# sphinx-quickstart on Wed May 8 00:04:49 2013.
@@ -100,7 +99,7 @@
}
-class CExtMock(object):
+class CExtMock:
"""Required for autodoc on readthedocs.org where you cannot build C extensions."""
def __init__(self, *args, **kwargs):
pass
diff --git a/html5lib/__init__.py b/html5lib/__init__.py
index 7b854f99..d2c68855 100644
--- a/html5lib/__init__.py
+++ b/html5lib/__init__.py
@@ -20,7 +20,6 @@
* :func:`~.serializer.serialize`
"""
-from __future__ import absolute_import, division, unicode_literals
from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
diff --git a/html5lib/_ihatexml.py b/html5lib/_ihatexml.py
index d725eabd..f5b6e1f4 100644
--- a/html5lib/_ihatexml.py
+++ b/html5lib/_ihatexml.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import re
import warnings
@@ -181,7 +180,7 @@ def escapeRegexp(string):
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
-class InfosetFilter(object):
+class InfosetFilter:
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self,
diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py
index a93b5a4e..abbc3d7a 100644
--- a/html5lib/_inputstream.py
+++ b/html5lib/_inputstream.py
@@ -1,13 +1,12 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-from six.moves import http_client, urllib
+import http.client
+import urllib.response
import codecs
import re
from io import BytesIO, StringIO
-import webencodings
+from .contrib import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
@@ -48,7 +47,7 @@
charsUntilRegEx = {}
-class BufferedStream(object):
+class BufferedStream:
"""Buffering for streams that do not have buffering of their own
The buffer is implemented as a list of chunks on the assumption that
@@ -125,15 +124,15 @@ def _readFromBuffer(self, bytes):
def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
- if (isinstance(source, http_client.HTTPResponse) or
+ if (isinstance(source, http.client.HTTPResponse) or
# Also check for addinfourl wrapping HTTPResponse
(isinstance(source, urllib.response.addbase) and
- isinstance(source.fp, http_client.HTTPResponse))):
+ isinstance(source.fp, http.client.HTTPResponse))):
isUnicode = False
elif hasattr(source, "read"):
- isUnicode = isinstance(source.read(0), text_type)
+ isUnicode = isinstance(source.read(0), str)
else:
- isUnicode = isinstance(source, text_type)
+ isUnicode = isinstance(source, str)
if isUnicode:
encodings = [x for x in kwargs if x.endswith("_encoding")]
@@ -145,7 +144,7 @@ def HTMLInputStream(source, **kwargs):
return HTMLBinaryInputStream(source, **kwargs)
-class HTMLUnicodeInputStream(object):
+class HTMLUnicodeInputStream:
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
@@ -673,7 +672,7 @@ def jumpTo(self, bytes):
return True
-class EncodingParser(object):
+class EncodingParser:
"""Mini parser for detecting character encoding from meta elements"""
def __init__(self, data):
@@ -861,7 +860,7 @@ def getAttribute(self):
attrValue.append(c)
-class ContentAttrParser(object):
+class ContentAttrParser:
def __init__(self, data):
assert isinstance(data, bytes)
self.data = data
diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py
index 4748a197..75dab441 100644
--- a/html5lib/_tokenizer.py
+++ b/html5lib/_tokenizer.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from six import unichr as chr
from collections import deque, OrderedDict
from sys import version_info
@@ -24,7 +21,7 @@
attributeMap = OrderedDict
-class HTMLTokenizer(object):
+class HTMLTokenizer:
""" This class takes care of tokenizing HTML.
* self.currentToken
diff --git a/html5lib/_trie/__init__.py b/html5lib/_trie/__init__.py
index 07bad5d3..df8912a0 100644
--- a/html5lib/_trie/__init__.py
+++ b/html5lib/_trie/__init__.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from .py import Trie
diff --git a/html5lib/_trie/_base.py b/html5lib/_trie/_base.py
index 6b71975f..63927ee4 100644
--- a/html5lib/_trie/_base.py
+++ b/html5lib/_trie/_base.py
@@ -1,9 +1,5 @@
-from __future__ import absolute_import, division, unicode_literals
-try:
- from collections.abc import Mapping
-except ImportError: # Python 2.7
- from collections import Mapping
+from collections.abc import Mapping
class Trie(Mapping):
diff --git a/html5lib/_trie/py.py b/html5lib/_trie/py.py
index c2ba3da7..bc6363c4 100644
--- a/html5lib/_trie/py.py
+++ b/html5lib/_trie/py.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-
from bisect import bisect_left
from ._base import Trie as ABCTrie
@@ -8,7 +5,7 @@
class Trie(ABCTrie):
def __init__(self, data):
- if not all(isinstance(x, text_type) for x in data.keys()):
+ if not all(isinstance(x, str) for x in data.keys()):
raise TypeError("All keys must be strings")
self._data = data
diff --git a/html5lib/_utils.py b/html5lib/_utils.py
index 7e23ee57..5853e81d 100644
--- a/html5lib/_utils.py
+++ b/html5lib/_utils.py
@@ -1,21 +1,9 @@
-from __future__ import absolute_import, division, unicode_literals
from types import ModuleType
-try:
- from collections.abc import Mapping
-except ImportError:
- from collections import Mapping
-
-from six import text_type, PY3
+from collections.abc import Mapping
-if PY3:
- import xml.etree.ElementTree as default_etree
-else:
- try:
- import xml.etree.cElementTree as default_etree
- except ImportError:
- import xml.etree.ElementTree as default_etree
+import xml.etree.ElementTree as default_etree
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
@@ -31,10 +19,10 @@
# escapes.
try:
_x = eval('"\\uD800"') # pylint:disable=eval-used
- if not isinstance(_x, text_type):
+ if not isinstance(_x, str):
# We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"') # pylint:disable=eval-used
- assert isinstance(_x, text_type)
+ assert isinstance(_x, str)
except Exception:
supports_lone_surrogates = False
else:
@@ -122,7 +110,7 @@ def moduleFactoryFactory(factory):
moduleCache = {}
def moduleFactory(baseModule, *args, **kwargs):
- if isinstance(ModuleType.__name__, type("")):
+ if isinstance(ModuleType.__name__, str):
name = "_%s_factory" % baseModule.__name__
else:
name = b"_%s_factory" % baseModule.__name__
diff --git a/html5lib/constants.py b/html5lib/constants.py
index 2fa4146d..a4b1efa1 100644
--- a/html5lib/constants.py
+++ b/html5lib/constants.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import string
diff --git a/html5lib/contrib/__init__.py b/html5lib/contrib/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/html5lib/contrib/webencodings/LiCENSE b/html5lib/contrib/webencodings/LiCENSE
new file mode 100644
index 00000000..3d0d3e70
--- /dev/null
+++ b/html5lib/contrib/webencodings/LiCENSE
@@ -0,0 +1,31 @@
+Copyright (c) 2012 by Simon Sapin.
+
+Some rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * The names of the contributors may not be used to endorse or
+ promote products derived from this software without specific
+ prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/html5lib/contrib/webencodings/__init__.py b/html5lib/contrib/webencodings/__init__.py
new file mode 100644
index 00000000..2d7bf148
--- /dev/null
+++ b/html5lib/contrib/webencodings/__init__.py
@@ -0,0 +1,340 @@
+# coding: utf-8
+"""
+
+ webencodings
+ ~~~~~~~~~~~~
+
+ This is a Python implementation of the `WHATWG Encoding standard
+ `. See README for details.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+import codecs
+
+from .labels import LABELS
+
+
+VERSION = '0.6-dev'
+
+
+# Some names in Encoding are not valid Python aliases. Remap these.
+PYTHON_NAMES = {
+ 'iso-8859-8-i': 'iso-8859-8',
+ 'x-mac-cyrillic': 'mac-cyrillic',
+ 'macintosh': 'mac-roman',
+ 'windows-874': 'cp874'}
+
+CACHE = {}
+
+
+def ascii_lower(string):
+ r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
+
+ :param string: A Unicode string.
+ :returns: A new Unicode string.
+
+ This is used for `ASCII case-insensitive
+ `_
+ matching of encoding labels.
+ The same matching is also used, among other things,
+ for `CSS keywords `_.
+
+ This is different from the :meth:`~py:str.lower` method of Unicode strings
+ which also affect non-ASCII characters,
+ sometimes mapping them into the ASCII range:
+
+ >>> keyword = u'Bac\N{KELVIN SIGN}ground'
+ >>> assert keyword.lower() == u'background'
+ >>> assert ascii_lower(keyword) != keyword.lower()
+ >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
+
+ """
+ # This turns out to be faster than unicode.translate()
+ return string.encode('utf8').lower().decode('utf8')
+
+
+def lookup(label):
+ """
+ Look for an encoding by its label.
+ This is the spec’s `get an encoding
+ `_ algorithm.
+ Supported labels are listed there.
+
+ :param label: A string.
+ :returns:
+ An :class:`Encoding` object, or :obj:`None` for an unknown label.
+
+ """
+ # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
+ label = ascii_lower(label.strip('\t\n\f\r '))
+ name = LABELS.get(label)
+ if name is None:
+ return None
+ encoding = CACHE.get(name)
+ if encoding is None:
+ if name == 'x-user-defined':
+ from .x_user_defined import codec_info
+ else:
+ python_name = PYTHON_NAMES.get(name, name)
+ # Any python_name value that gets to here should be valid.
+ codec_info = codecs.lookup(python_name)
+ encoding = Encoding(name, codec_info)
+ CACHE[name] = encoding
+ return encoding
+
+
+def _get_encoding(encoding_or_label):
+ """
+ Accept either an encoding object or label.
+
+ :param encoding: An :class:`Encoding` object or a label string.
+ :returns: An :class:`Encoding` object.
+ :raises: :exc:`~exceptions.LookupError` for an unknown label.
+
+ """
+ if hasattr(encoding_or_label, 'codec_info'):
+ return encoding_or_label
+
+ encoding = lookup(encoding_or_label)
+ if encoding is None:
+ raise LookupError('Unknown encoding label: %r' % encoding_or_label)
+ return encoding
+
+
+class Encoding(object):
+ """Reresents a character encoding such as UTF-8,
+ that can be used for decoding or encoding.
+
+ .. attribute:: name
+
+ Canonical name of the encoding
+
+ .. attribute:: codec_info
+
+ The actual implementation of the encoding,
+ a stdlib :class:`~codecs.CodecInfo` object.
+ See :func:`codecs.register`.
+
+ """
+ def __init__(self, name, codec_info):
+ self.name = name
+ self.codec_info = codec_info
+
+ def __repr__(self):
+ return '' % self.name
+
+
+#: The UTF-8 encoding. Should be used for new content and formats.
+UTF8 = lookup('utf-8')
+
+_UTF16LE = lookup('utf-16le')
+_UTF16BE = lookup('utf-16be')
+
+
+def decode(input, fallback_encoding, errors='replace'):
+ """
+ Decode a single string.
+
+ :param input: A byte string
+ :param fallback_encoding:
+ An :class:`Encoding` object or a label string.
+ The encoding to use if :obj:`input` does note have a BOM.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :return:
+ A ``(output, encoding)`` tuple of an Unicode string
+ and an :obj:`Encoding`.
+
+ """
+ # Fail early if `encoding` is an invalid label.
+ fallback_encoding = _get_encoding(fallback_encoding)
+ bom_encoding, input = _detect_bom(input)
+ encoding = bom_encoding or fallback_encoding
+ return encoding.codec_info.decode(input, errors)[0], encoding
+
+
+def _detect_bom(input):
+ """Return (bom_encoding, input), with any BOM removed from the input."""
+ if input.startswith(b'\xFF\xFE'):
+ return _UTF16LE, input[2:]
+ if input.startswith(b'\xFE\xFF'):
+ return _UTF16BE, input[2:]
+ if input.startswith(b'\xEF\xBB\xBF'):
+ return UTF8, input[3:]
+ return None, input
+
+
+def encode(input, encoding=UTF8, errors='strict'):
+ """
+ Encode a single string.
+
+ :param input: An Unicode string.
+ :param encoding: An :class:`Encoding` object or a label string.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :return: A byte string.
+
+ """
+ return _get_encoding(encoding).codec_info.encode(input, errors)[0]
+
+
+def iter_decode(input, fallback_encoding, errors='replace'):
+ """
+ "Pull"-based decoder.
+
+ :param input:
+ An iterable of byte strings.
+
+ The input is first consumed just enough to determine the encoding
+ based on the precense of a BOM,
+ then consumed on demand when the return value is.
+ :param fallback_encoding:
+ An :class:`Encoding` object or a label string.
+ The encoding to use if :obj:`input` does note have a BOM.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :returns:
+ An ``(output, encoding)`` tuple.
+ :obj:`output` is an iterable of Unicode strings,
+ :obj:`encoding` is the :obj:`Encoding` that is being used.
+
+ """
+
+ decoder = IncrementalDecoder(fallback_encoding, errors)
+ generator = _iter_decode_generator(input, decoder)
+ encoding = next(generator)
+ return generator, encoding
+
+
+def _iter_decode_generator(input, decoder):
+ """Return a generator that first yields the :obj:`Encoding`,
+ then yields output chukns as Unicode strings.
+
+ """
+ decode = decoder.decode
+ input = iter(input)
+ for chunck in input:
+ output = decode(chunck)
+ if output:
+ assert decoder.encoding is not None
+ yield decoder.encoding
+ yield output
+ break
+ else:
+ # Input exhausted without determining the encoding
+ output = decode(b'', final=True)
+ assert decoder.encoding is not None
+ yield decoder.encoding
+ if output:
+ yield output
+ return
+
+ for chunck in input:
+ output = decode(chunck)
+ if output:
+ yield output
+ output = decode(b'', final=True)
+ if output:
+ yield output
+
+
+def iter_encode(input, encoding=UTF8, errors='strict'):
+ """
+ “Pull”-based encoder.
+
+ :param input: An iterable of Unicode strings.
+ :param encoding: An :class:`Encoding` object or a label string.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :returns: An iterable of byte strings.
+
+ """
+ # Fail early if `encoding` is an invalid label.
+ encode = IncrementalEncoder(encoding, errors).encode
+ return _iter_encode_generator(input, encode)
+
+
+def _iter_encode_generator(input, encode):
+ for chunck in input:
+ output = encode(chunck)
+ if output:
+ yield output
+ output = encode('', final=True)
+ if output:
+ yield output
+
+
+class IncrementalDecoder(object):
+ """
+ “Push”-based decoder.
+
+ :param fallback_encoding:
+ An :class:`Encoding` object or a label string.
+ The encoding to use if :obj:`input` does note have a BOM.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+ """
+ def __init__(self, fallback_encoding, errors='replace'):
+ # Fail early if `encoding` is an invalid label.
+ self._fallback_encoding = _get_encoding(fallback_encoding)
+ self._errors = errors
+ self._buffer = b''
+ self._decoder = None
+ #: The actual :class:`Encoding` that is being used,
+ #: or :obj:`None` if that is not determined yet.
+ #: (Ie. if there is not enough input yet to determine
+ #: if there is a BOM.)
+ self.encoding = None # Not known yet.
+
+ def decode(self, input, final=False):
+ """Decode one chunk of the input.
+
+ :param input: A byte string.
+ :param final:
+ Indicate that no more input is available.
+ Must be :obj:`True` if this is the last call.
+ :returns: An Unicode string.
+
+ """
+ decoder = self._decoder
+ if decoder is not None:
+ return decoder(input, final)
+
+ input = self._buffer + input
+ encoding, input = _detect_bom(input)
+ if encoding is None:
+ if len(input) < 3 and not final: # Not enough data yet.
+ self._buffer = input
+ return ''
+ else: # No BOM
+ encoding = self._fallback_encoding
+ decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
+ self._decoder = decoder
+ self.encoding = encoding
+ return decoder(input, final)
+
+
+class IncrementalEncoder(object):
+ """
+ “Push”-based encoder.
+
+ :param encoding: An :class:`Encoding` object or a label string.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+ .. method:: encode(input, final=False)
+
+ :param input: An Unicode string.
+ :param final:
+ Indicate that no more input is available.
+ Must be :obj:`True` if this is the last call.
+ :returns: A byte string.
+
+ """
+ def __init__(self, encoding=UTF8, errors='strict'):
+ encoding = _get_encoding(encoding)
+ self.encode = encoding.codec_info.incrementalencoder(errors).encode
diff --git a/html5lib/contrib/webencodings/labels.py b/html5lib/contrib/webencodings/labels.py
new file mode 100644
index 00000000..29cbf91e
--- /dev/null
+++ b/html5lib/contrib/webencodings/labels.py
@@ -0,0 +1,231 @@
+"""
+
+ webencodings.labels
+ ~~~~~~~~~~~~~~~~~~~
+
+ Map encoding labels to their name.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+ 'unicode-1-1-utf-8': 'utf-8',
+ 'utf-8': 'utf-8',
+ 'utf8': 'utf-8',
+ '866': 'ibm866',
+ 'cp866': 'ibm866',
+ 'csibm866': 'ibm866',
+ 'ibm866': 'ibm866',
+ 'csisolatin2': 'iso-8859-2',
+ 'iso-8859-2': 'iso-8859-2',
+ 'iso-ir-101': 'iso-8859-2',
+ 'iso8859-2': 'iso-8859-2',
+ 'iso88592': 'iso-8859-2',
+ 'iso_8859-2': 'iso-8859-2',
+ 'iso_8859-2:1987': 'iso-8859-2',
+ 'l2': 'iso-8859-2',
+ 'latin2': 'iso-8859-2',
+ 'csisolatin3': 'iso-8859-3',
+ 'iso-8859-3': 'iso-8859-3',
+ 'iso-ir-109': 'iso-8859-3',
+ 'iso8859-3': 'iso-8859-3',
+ 'iso88593': 'iso-8859-3',
+ 'iso_8859-3': 'iso-8859-3',
+ 'iso_8859-3:1988': 'iso-8859-3',
+ 'l3': 'iso-8859-3',
+ 'latin3': 'iso-8859-3',
+ 'csisolatin4': 'iso-8859-4',
+ 'iso-8859-4': 'iso-8859-4',
+ 'iso-ir-110': 'iso-8859-4',
+ 'iso8859-4': 'iso-8859-4',
+ 'iso88594': 'iso-8859-4',
+ 'iso_8859-4': 'iso-8859-4',
+ 'iso_8859-4:1988': 'iso-8859-4',
+ 'l4': 'iso-8859-4',
+ 'latin4': 'iso-8859-4',
+ 'csisolatincyrillic': 'iso-8859-5',
+ 'cyrillic': 'iso-8859-5',
+ 'iso-8859-5': 'iso-8859-5',
+ 'iso-ir-144': 'iso-8859-5',
+ 'iso8859-5': 'iso-8859-5',
+ 'iso88595': 'iso-8859-5',
+ 'iso_8859-5': 'iso-8859-5',
+ 'iso_8859-5:1988': 'iso-8859-5',
+ 'arabic': 'iso-8859-6',
+ 'asmo-708': 'iso-8859-6',
+ 'csiso88596e': 'iso-8859-6',
+ 'csiso88596i': 'iso-8859-6',
+ 'csisolatinarabic': 'iso-8859-6',
+ 'ecma-114': 'iso-8859-6',
+ 'iso-8859-6': 'iso-8859-6',
+ 'iso-8859-6-e': 'iso-8859-6',
+ 'iso-8859-6-i': 'iso-8859-6',
+ 'iso-ir-127': 'iso-8859-6',
+ 'iso8859-6': 'iso-8859-6',
+ 'iso88596': 'iso-8859-6',
+ 'iso_8859-6': 'iso-8859-6',
+ 'iso_8859-6:1987': 'iso-8859-6',
+ 'csisolatingreek': 'iso-8859-7',
+ 'ecma-118': 'iso-8859-7',
+ 'elot_928': 'iso-8859-7',
+ 'greek': 'iso-8859-7',
+ 'greek8': 'iso-8859-7',
+ 'iso-8859-7': 'iso-8859-7',
+ 'iso-ir-126': 'iso-8859-7',
+ 'iso8859-7': 'iso-8859-7',
+ 'iso88597': 'iso-8859-7',
+ 'iso_8859-7': 'iso-8859-7',
+ 'iso_8859-7:1987': 'iso-8859-7',
+ 'sun_eu_greek': 'iso-8859-7',
+ 'csiso88598e': 'iso-8859-8',
+ 'csisolatinhebrew': 'iso-8859-8',
+ 'hebrew': 'iso-8859-8',
+ 'iso-8859-8': 'iso-8859-8',
+ 'iso-8859-8-e': 'iso-8859-8',
+ 'iso-ir-138': 'iso-8859-8',
+ 'iso8859-8': 'iso-8859-8',
+ 'iso88598': 'iso-8859-8',
+ 'iso_8859-8': 'iso-8859-8',
+ 'iso_8859-8:1988': 'iso-8859-8',
+ 'visual': 'iso-8859-8',
+ 'csiso88598i': 'iso-8859-8-i',
+ 'iso-8859-8-i': 'iso-8859-8-i',
+ 'logical': 'iso-8859-8-i',
+ 'csisolatin6': 'iso-8859-10',
+ 'iso-8859-10': 'iso-8859-10',
+ 'iso-ir-157': 'iso-8859-10',
+ 'iso8859-10': 'iso-8859-10',
+ 'iso885910': 'iso-8859-10',
+ 'l6': 'iso-8859-10',
+ 'latin6': 'iso-8859-10',
+ 'iso-8859-13': 'iso-8859-13',
+ 'iso8859-13': 'iso-8859-13',
+ 'iso885913': 'iso-8859-13',
+ 'iso-8859-14': 'iso-8859-14',
+ 'iso8859-14': 'iso-8859-14',
+ 'iso885914': 'iso-8859-14',
+ 'csisolatin9': 'iso-8859-15',
+ 'iso-8859-15': 'iso-8859-15',
+ 'iso8859-15': 'iso-8859-15',
+ 'iso885915': 'iso-8859-15',
+ 'iso_8859-15': 'iso-8859-15',
+ 'l9': 'iso-8859-15',
+ 'iso-8859-16': 'iso-8859-16',
+ 'cskoi8r': 'koi8-r',
+ 'koi': 'koi8-r',
+ 'koi8': 'koi8-r',
+ 'koi8-r': 'koi8-r',
+ 'koi8_r': 'koi8-r',
+ 'koi8-u': 'koi8-u',
+ 'csmacintosh': 'macintosh',
+ 'mac': 'macintosh',
+ 'macintosh': 'macintosh',
+ 'x-mac-roman': 'macintosh',
+ 'dos-874': 'windows-874',
+ 'iso-8859-11': 'windows-874',
+ 'iso8859-11': 'windows-874',
+ 'iso885911': 'windows-874',
+ 'tis-620': 'windows-874',
+ 'windows-874': 'windows-874',
+ 'cp1250': 'windows-1250',
+ 'windows-1250': 'windows-1250',
+ 'x-cp1250': 'windows-1250',
+ 'cp1251': 'windows-1251',
+ 'windows-1251': 'windows-1251',
+ 'x-cp1251': 'windows-1251',
+ 'ansi_x3.4-1968': 'windows-1252',
+ 'ascii': 'windows-1252',
+ 'cp1252': 'windows-1252',
+ 'cp819': 'windows-1252',
+ 'csisolatin1': 'windows-1252',
+ 'ibm819': 'windows-1252',
+ 'iso-8859-1': 'windows-1252',
+ 'iso-ir-100': 'windows-1252',
+ 'iso8859-1': 'windows-1252',
+ 'iso88591': 'windows-1252',
+ 'iso_8859-1': 'windows-1252',
+ 'iso_8859-1:1987': 'windows-1252',
+ 'l1': 'windows-1252',
+ 'latin1': 'windows-1252',
+ 'us-ascii': 'windows-1252',
+ 'windows-1252': 'windows-1252',
+ 'x-cp1252': 'windows-1252',
+ 'cp1253': 'windows-1253',
+ 'windows-1253': 'windows-1253',
+ 'x-cp1253': 'windows-1253',
+ 'cp1254': 'windows-1254',
+ 'csisolatin5': 'windows-1254',
+ 'iso-8859-9': 'windows-1254',
+ 'iso-ir-148': 'windows-1254',
+ 'iso8859-9': 'windows-1254',
+ 'iso88599': 'windows-1254',
+ 'iso_8859-9': 'windows-1254',
+ 'iso_8859-9:1989': 'windows-1254',
+ 'l5': 'windows-1254',
+ 'latin5': 'windows-1254',
+ 'windows-1254': 'windows-1254',
+ 'x-cp1254': 'windows-1254',
+ 'cp1255': 'windows-1255',
+ 'windows-1255': 'windows-1255',
+ 'x-cp1255': 'windows-1255',
+ 'cp1256': 'windows-1256',
+ 'windows-1256': 'windows-1256',
+ 'x-cp1256': 'windows-1256',
+ 'cp1257': 'windows-1257',
+ 'windows-1257': 'windows-1257',
+ 'x-cp1257': 'windows-1257',
+ 'cp1258': 'windows-1258',
+ 'windows-1258': 'windows-1258',
+ 'x-cp1258': 'windows-1258',
+ 'x-mac-cyrillic': 'x-mac-cyrillic',
+ 'x-mac-ukrainian': 'x-mac-cyrillic',
+ 'chinese': 'gbk',
+ 'csgb2312': 'gbk',
+ 'csiso58gb231280': 'gbk',
+ 'gb2312': 'gbk',
+ 'gb_2312': 'gbk',
+ 'gb_2312-80': 'gbk',
+ 'gbk': 'gbk',
+ 'iso-ir-58': 'gbk',
+ 'x-gbk': 'gbk',
+ 'gb18030': 'gb18030',
+ 'hz-gb-2312': 'hz-gb-2312',
+ 'big5': 'big5',
+ 'big5-hkscs': 'big5',
+ 'cn-big5': 'big5',
+ 'csbig5': 'big5',
+ 'x-x-big5': 'big5',
+ 'cseucpkdfmtjapanese': 'euc-jp',
+ 'euc-jp': 'euc-jp',
+ 'x-euc-jp': 'euc-jp',
+ 'csiso2022jp': 'iso-2022-jp',
+ 'iso-2022-jp': 'iso-2022-jp',
+ 'csshiftjis': 'shift_jis',
+ 'ms_kanji': 'shift_jis',
+ 'shift-jis': 'shift_jis',
+ 'shift_jis': 'shift_jis',
+ 'sjis': 'shift_jis',
+ 'windows-31j': 'shift_jis',
+ 'x-sjis': 'shift_jis',
+ 'cseuckr': 'euc-kr',
+ 'csksc56011987': 'euc-kr',
+ 'euc-kr': 'euc-kr',
+ 'iso-ir-149': 'euc-kr',
+ 'korean': 'euc-kr',
+ 'ks_c_5601-1987': 'euc-kr',
+ 'ks_c_5601-1989': 'euc-kr',
+ 'ksc5601': 'euc-kr',
+ 'ksc_5601': 'euc-kr',
+ 'windows-949': 'euc-kr',
+ 'csiso2022kr': 'iso-2022-kr',
+ 'iso-2022-kr': 'iso-2022-kr',
+ 'utf-16be': 'utf-16be',
+ 'utf-16': 'utf-16le',
+ 'utf-16le': 'utf-16le',
+ 'x-user-defined': 'x-user-defined',
+}
diff --git a/html5lib/contrib/webencodings/mklabels.py b/html5lib/contrib/webencodings/mklabels.py
new file mode 100644
index 00000000..22168eb2
--- /dev/null
+++ b/html5lib/contrib/webencodings/mklabels.py
@@ -0,0 +1,56 @@
+"""
+
+ webencodings.mklabels
+ ~~~~~~~~~~~~~~~~~~~~~
+
+ Regenarate the webencodings.labels module.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+import json
+from urllib.request import urlopen
+
+
+def assert_lower(string):
+ assert string == string.lower()
+ return string
+
+
+def generate(url):
+ parts = ['''\
+"""
+
+ webencodings.labels
+ ~~~~~~~~~~~~~~~~~~~
+
+ Map encoding labels to their name.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+''']
+ labels = [
+ (repr(assert_lower(label)).lstrip('u'),
+ repr(encoding['name']).lstrip('u'))
+ for category in json.loads(urlopen(url).read().decode('ascii'))
+ for encoding in category['encodings']
+ for label in encoding['labels']]
+ max_len = max(len(label) for label, name in labels)
+ parts.extend(
+ ' %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name)
+ for label, name in labels)
+ parts.append('}')
+ return ''.join(parts)
+
+
+if __name__ == '__main__':
+ print(generate('http://encoding.spec.whatwg.org/encodings.json'))
diff --git a/html5lib/contrib/webencodings/tests.py b/html5lib/contrib/webencodings/tests.py
new file mode 100644
index 00000000..4852d1f5
--- /dev/null
+++ b/html5lib/contrib/webencodings/tests.py
@@ -0,0 +1,151 @@
+# coding: utf-8
+"""
+
+ webencodings.tests
+ ~~~~~~~~~~~~~~~~~~
+
+ A basic test suite for Encoding.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
+ IncrementalDecoder, IncrementalEncoder, UTF8)
+
+
+def assert_raises(exception, function, *args, **kwargs):
+ try:
+ function(*args, **kwargs)
+ except exception:
+ return
+ else: # pragma: no cover
+ raise AssertionError('Did not raise %s.' % exception)
+
+
+def test_labels():
+ assert lookup('utf-8').name == 'utf-8'
+ assert lookup('Utf-8').name == 'utf-8'
+ assert lookup('UTF-8').name == 'utf-8'
+ assert lookup('utf8').name == 'utf-8'
+ assert lookup('utf8').name == 'utf-8'
+ assert lookup('utf8 ').name == 'utf-8'
+ assert lookup(' \r\nutf8\t').name == 'utf-8'
+ assert lookup('u8') is None # Python label.
+ assert lookup('utf-8 ') is None # Non-ASCII white space.
+
+ assert lookup('US-ASCII').name == 'windows-1252'
+ assert lookup('iso-8859-1').name == 'windows-1252'
+ assert lookup('latin1').name == 'windows-1252'
+ assert lookup('LATIN1').name == 'windows-1252'
+ assert lookup('latin-1') is None
+ assert lookup('LATİN1') is None # ASCII-only case insensitivity.
+
+
+def test_all_labels():
+ for label in LABELS:
+ assert decode(b'', label) == ('', lookup(label))
+ assert encode('', label) == b''
+ for repeat in [0, 1, 12]:
+ output, _ = iter_decode([b''] * repeat, label)
+ assert list(output) == []
+ assert list(iter_encode([''] * repeat, label)) == []
+ decoder = IncrementalDecoder(label)
+ assert decoder.decode(b'') == ''
+ assert decoder.decode(b'', final=True) == ''
+ encoder = IncrementalEncoder(label)
+ assert encoder.encode('') == b''
+ assert encoder.encode('', final=True) == b''
+ # All encoding names are valid labels too:
+ for name in set(LABELS.values()):
+ assert lookup(name).name == name
+
+
+def test_invalid_label():
+ assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid')
+ assert_raises(LookupError, encode, 'é', 'invalid')
+ assert_raises(LookupError, iter_decode, [], 'invalid')
+ assert_raises(LookupError, iter_encode, [], 'invalid')
+ assert_raises(LookupError, IncrementalDecoder, 'invalid')
+ assert_raises(LookupError, IncrementalEncoder, 'invalid')
+
+
+def test_decode():
+ assert decode(b'\x80', 'latin1') == ('€', lookup('latin1'))
+ assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
+ assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
+ assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
+ assert decode(b'\xc3\xa9', 'ascii') == ('é', lookup('ascii'))
+ assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8')) # UTF-8 with BOM
+
+ assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be')) # UTF-16-BE with BOM
+ assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le')) # UTF-16-LE with BOM
+ assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be'))
+ assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le'))
+
+ assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
+ assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
+ assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))
+
+ assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
+ assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
+ assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
+
+
+def test_encode():
+ assert encode('é', 'latin1') == b'\xe9'
+ assert encode('é', 'utf8') == b'\xc3\xa9'
+ assert encode('é', 'utf8') == b'\xc3\xa9'
+ assert encode('é', 'utf-16') == b'\xe9\x00'
+ assert encode('é', 'utf-16le') == b'\xe9\x00'
+ assert encode('é', 'utf-16be') == b'\x00\xe9'
+
+
+def test_iter_decode():
+ def iter_decode_to_string(input, fallback_encoding):
+ output, _encoding = iter_decode(input, fallback_encoding)
+ return ''.join(output)
+ assert iter_decode_to_string([], 'latin1') == ''
+ assert iter_decode_to_string([b''], 'latin1') == ''
+ assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
+ assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
+ assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
+ assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
+ assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'é'
+ assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
+ assert iter_decode_to_string([
+ b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
+ assert iter_decode_to_string([
+ b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
+ assert iter_decode_to_string([
+ b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
+ assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
+ assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»'
+ assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
+ assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
+ assert iter_decode_to_string([
+ b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'
+ assert iter_decode_to_string([
+ b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'
+
+
+def test_iter_encode():
+ assert b''.join(iter_encode([], 'latin1')) == b''
+ assert b''.join(iter_encode([''], 'latin1')) == b''
+ assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9'
+ assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9'
+ assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00'
+ assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00'
+ assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9'
+ assert b''.join(iter_encode([
+ '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo'
+
+
+def test_x_user_defined():
+ encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca'
+ decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'
+ encoded = b'aa'
+ decoded = 'aa'
+ assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined'))
+ assert encode(decoded, 'x-user-defined') == encoded
diff --git a/html5lib/contrib/webencodings/x_user_defined.py b/html5lib/contrib/webencodings/x_user_defined.py
new file mode 100644
index 00000000..8467f4f0
--- /dev/null
+++ b/html5lib/contrib/webencodings/x_user_defined.py
@@ -0,0 +1,323 @@
+# coding: utf-8
+"""
+
+ webencodings.x_user_defined
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ An implementation of the x-user-defined encoding.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+import codecs
+
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+
+ def encode(self, input, errors='strict'):
+ return codecs.charmap_encode(input, errors, encoding_table)
+
+ def decode(self, input, errors='strict'):
+ return codecs.charmap_decode(input, errors, decoding_table)
+
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+ def encode(self, input, final=False):
+ return codecs.charmap_encode(input, self.errors, encoding_table)[0]
+
+
+class IncrementalDecoder(codecs.IncrementalDecoder):
+ def decode(self, input, final=False):
+ return codecs.charmap_decode(input, self.errors, decoding_table)[0]
+
+
+class StreamWriter(Codec, codecs.StreamWriter):
+ pass
+
+
+class StreamReader(Codec, codecs.StreamReader):
+ pass
+
+
+### encodings module API
+
+codec_info = codecs.CodecInfo(
+ name='x-user-defined',
+ encode=Codec().encode,
+ decode=Codec().decode,
+ incrementalencoder=IncrementalEncoder,
+ incrementaldecoder=IncrementalDecoder,
+ streamreader=StreamReader,
+ streamwriter=StreamWriter,
+)
+
+
+### Decoding Table
+
+# Python 3:
+# for c in range(256): print(' %r' % chr(c if c < 128 else c + 0xF700))
+decoding_table = (
+ '\x00'
+ '\x01'
+ '\x02'
+ '\x03'
+ '\x04'
+ '\x05'
+ '\x06'
+ '\x07'
+ '\x08'
+ '\t'
+ '\n'
+ '\x0b'
+ '\x0c'
+ '\r'
+ '\x0e'
+ '\x0f'
+ '\x10'
+ '\x11'
+ '\x12'
+ '\x13'
+ '\x14'
+ '\x15'
+ '\x16'
+ '\x17'
+ '\x18'
+ '\x19'
+ '\x1a'
+ '\x1b'
+ '\x1c'
+ '\x1d'
+ '\x1e'
+ '\x1f'
+ ' '
+ '!'
+ '"'
+ '#'
+ '$'
+ '%'
+ '&'
+ "'"
+ '('
+ ')'
+ '*'
+ '+'
+ ','
+ '-'
+ '.'
+ '/'
+ '0'
+ '1'
+ '2'
+ '3'
+ '4'
+ '5'
+ '6'
+ '7'
+ '8'
+ '9'
+ ':'
+ ';'
+ '<'
+ '='
+ '>'
+ '?'
+ '@'
+ 'A'
+ 'B'
+ 'C'
+ 'D'
+ 'E'
+ 'F'
+ 'G'
+ 'H'
+ 'I'
+ 'J'
+ 'K'
+ 'L'
+ 'M'
+ 'N'
+ 'O'
+ 'P'
+ 'Q'
+ 'R'
+ 'S'
+ 'T'
+ 'U'
+ 'V'
+ 'W'
+ 'X'
+ 'Y'
+ 'Z'
+ '['
+ '\\'
+ ']'
+ '^'
+ '_'
+ '`'
+ 'a'
+ 'b'
+ 'c'
+ 'd'
+ 'e'
+ 'f'
+ 'g'
+ 'h'
+ 'i'
+ 'j'
+ 'k'
+ 'l'
+ 'm'
+ 'n'
+ 'o'
+ 'p'
+ 'q'
+ 'r'
+ 's'
+ 't'
+ 'u'
+ 'v'
+ 'w'
+ 'x'
+ 'y'
+ 'z'
+ '{'
+ '|'
+ '}'
+ '~'
+ '\x7f'
+ '\uf780'
+ '\uf781'
+ '\uf782'
+ '\uf783'
+ '\uf784'
+ '\uf785'
+ '\uf786'
+ '\uf787'
+ '\uf788'
+ '\uf789'
+ '\uf78a'
+ '\uf78b'
+ '\uf78c'
+ '\uf78d'
+ '\uf78e'
+ '\uf78f'
+ '\uf790'
+ '\uf791'
+ '\uf792'
+ '\uf793'
+ '\uf794'
+ '\uf795'
+ '\uf796'
+ '\uf797'
+ '\uf798'
+ '\uf799'
+ '\uf79a'
+ '\uf79b'
+ '\uf79c'
+ '\uf79d'
+ '\uf79e'
+ '\uf79f'
+ '\uf7a0'
+ '\uf7a1'
+ '\uf7a2'
+ '\uf7a3'
+ '\uf7a4'
+ '\uf7a5'
+ '\uf7a6'
+ '\uf7a7'
+ '\uf7a8'
+ '\uf7a9'
+ '\uf7aa'
+ '\uf7ab'
+ '\uf7ac'
+ '\uf7ad'
+ '\uf7ae'
+ '\uf7af'
+ '\uf7b0'
+ '\uf7b1'
+ '\uf7b2'
+ '\uf7b3'
+ '\uf7b4'
+ '\uf7b5'
+ '\uf7b6'
+ '\uf7b7'
+ '\uf7b8'
+ '\uf7b9'
+ '\uf7ba'
+ '\uf7bb'
+ '\uf7bc'
+ '\uf7bd'
+ '\uf7be'
+ '\uf7bf'
+ '\uf7c0'
+ '\uf7c1'
+ '\uf7c2'
+ '\uf7c3'
+ '\uf7c4'
+ '\uf7c5'
+ '\uf7c6'
+ '\uf7c7'
+ '\uf7c8'
+ '\uf7c9'
+ '\uf7ca'
+ '\uf7cb'
+ '\uf7cc'
+ '\uf7cd'
+ '\uf7ce'
+ '\uf7cf'
+ '\uf7d0'
+ '\uf7d1'
+ '\uf7d2'
+ '\uf7d3'
+ '\uf7d4'
+ '\uf7d5'
+ '\uf7d6'
+ '\uf7d7'
+ '\uf7d8'
+ '\uf7d9'
+ '\uf7da'
+ '\uf7db'
+ '\uf7dc'
+ '\uf7dd'
+ '\uf7de'
+ '\uf7df'
+ '\uf7e0'
+ '\uf7e1'
+ '\uf7e2'
+ '\uf7e3'
+ '\uf7e4'
+ '\uf7e5'
+ '\uf7e6'
+ '\uf7e7'
+ '\uf7e8'
+ '\uf7e9'
+ '\uf7ea'
+ '\uf7eb'
+ '\uf7ec'
+ '\uf7ed'
+ '\uf7ee'
+ '\uf7ef'
+ '\uf7f0'
+ '\uf7f1'
+ '\uf7f2'
+ '\uf7f3'
+ '\uf7f4'
+ '\uf7f5'
+ '\uf7f6'
+ '\uf7f7'
+ '\uf7f8'
+ '\uf7f9'
+ '\uf7fa'
+ '\uf7fb'
+ '\uf7fc'
+ '\uf7fd'
+ '\uf7fe'
+ '\uf7ff'
+)
+
+### Encoding table
+encoding_table = codecs.charmap_build(decoding_table)
diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py
index 5ba926e3..c0be95b2 100644
--- a/html5lib/filters/alphabeticalattributes.py
+++ b/html5lib/filters/alphabeticalattributes.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from . import base
diff --git a/html5lib/filters/base.py b/html5lib/filters/base.py
index c7dbaed0..6d6639e6 100644
--- a/html5lib/filters/base.py
+++ b/html5lib/filters/base.py
@@ -1,7 +1,6 @@
-from __future__ import absolute_import, division, unicode_literals
-class Filter(object):
+class Filter:
def __init__(self, source):
self.source = source
diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py
index aefb5c84..c8dc57b8 100644
--- a/html5lib/filters/inject_meta_charset.py
+++ b/html5lib/filters/inject_meta_charset.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from . import base
diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index acd4d7a2..0d47f921 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from six import text_type
from . import base
from ..constants import namespaces, voidElements
@@ -33,9 +30,9 @@ def __iter__(self):
if type in ("StartTag", "EmptyTag"):
namespace = token["namespace"]
name = token["name"]
- assert namespace is None or isinstance(namespace, text_type)
+ assert namespace is None or isinstance(namespace, str)
assert namespace != ""
- assert isinstance(name, text_type)
+ assert isinstance(name, str)
assert name != ""
assert isinstance(token["data"], dict)
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
@@ -45,18 +42,18 @@ def __iter__(self):
if type == "StartTag" and self.require_matching_tags:
open_elements.append((namespace, name))
for (namespace, name), value in token["data"].items():
- assert namespace is None or isinstance(namespace, text_type)
+ assert namespace is None or isinstance(namespace, str)
assert namespace != ""
- assert isinstance(name, text_type)
+ assert isinstance(name, str)
assert name != ""
- assert isinstance(value, text_type)
+ assert isinstance(value, str)
elif type == "EndTag":
namespace = token["namespace"]
name = token["name"]
- assert namespace is None or isinstance(namespace, text_type)
+ assert namespace is None or isinstance(namespace, str)
assert namespace != ""
- assert isinstance(name, text_type)
+ assert isinstance(name, str)
assert name != ""
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
@@ -66,26 +63,26 @@ def __iter__(self):
elif type == "Comment":
data = token["data"]
- assert isinstance(data, text_type)
+ assert isinstance(data, str)
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
- assert isinstance(data, text_type)
+ assert isinstance(data, str)
assert data != ""
if type == "SpaceCharacters":
assert data.strip(spaceCharacters) == ""
elif type == "Doctype":
name = token["name"]
- assert name is None or isinstance(name, text_type)
- assert token["publicId"] is None or isinstance(name, text_type)
- assert token["systemId"] is None or isinstance(name, text_type)
+ assert name is None or isinstance(name, str)
+ assert token["publicId"] is None or isinstance(name, str)
+ assert token["systemId"] is None or isinstance(name, str)
elif type == "Entity":
- assert isinstance(token["name"], text_type)
+ assert isinstance(token["name"], str)
elif type == "SerializerError":
- assert isinstance(token["data"], text_type)
+ assert isinstance(token["data"], str)
else:
assert False, "Unknown token type: %(type)s" % {"type": type}
diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py
index 4a865012..a44b2a00 100644
--- a/html5lib/filters/optionaltags.py
+++ b/html5lib/filters/optionaltags.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from . import base
diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py
index ea2c5dd3..94c8602c 100644
--- a/html5lib/filters/sanitizer.py
+++ b/html5lib/filters/sanitizer.py
@@ -6,14 +6,12 @@
if Bleach is unsuitable for your needs.
"""
-from __future__ import absolute_import, division, unicode_literals
import re
import warnings
+from urllib.parse import urlparse
from xml.sax.saxutils import escape, unescape
-from six.moves import urllib_parse as urlparse
-
from . import base
from ..constants import namespaces, prefixes
@@ -846,7 +844,7 @@ def allowed_token(self, token):
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
try:
- uri = urlparse.urlparse(val_unescaped)
+ uri = urlparse(val_unescaped)
except ValueError:
uri = None
del attrs[attr]
diff --git a/html5lib/filters/whitespace.py b/html5lib/filters/whitespace.py
index 0d12584b..ab40ef5a 100644
--- a/html5lib/filters/whitespace.py
+++ b/html5lib/filters/whitespace.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import re
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index b3c206d1..91d71a88 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import viewkeys
-
from . import _inputstream
from . import _tokenizer
@@ -69,7 +66,7 @@ def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElemen
return p.parseFragment(doc, container=container, **kwargs)
-class HTMLParser(object):
+class HTMLParser:
"""HTML parser
Generates a tree structure from a stream of (possibly malformed) HTML.
@@ -397,7 +394,7 @@ def parseRCDataRawtext(self, token, contentType):
self.phase = self.phases["text"]
-class Phase(object):
+class Phase:
"""Base class for helper object that implements each phase of processing
"""
__slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
@@ -428,7 +425,7 @@ def processSpaceCharacters(self, token):
def processStartTag(self, token):
# Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
- # (CPython 2.7, 3.8) GC cost when parsing many short inputs
+ # (CPython 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
@@ -455,7 +452,7 @@ def startTagHtml(self, token):
def processEndTag(self, token):
# Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
- # (CPython 2.7, 3.8) GC cost when parsing many short inputs
+ # (CPython 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
@@ -2774,7 +2771,7 @@ def processEndTag(self, token):
def adjust_attributes(token, replacements):
- needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
+ needs_adjustment = token['data'].keys() & replacements.keys()
if needs_adjustment:
token['data'] = type(token['data'])((replacements.get(k, k), v)
for k, v in token['data'].items())
diff --git a/html5lib/serializer.py b/html5lib/serializer.py
index a171ac1c..ed52593f 100644
--- a/html5lib/serializer.py
+++ b/html5lib/serializer.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-
import re
from codecs import register_error, xmlcharrefreplace_errors
@@ -101,7 +98,7 @@ def serialize(input, tree="etree", encoding=None, **serializer_opts):
return s.render(walker(input), encoding)
-class HTMLSerializer(object):
+class HTMLSerializer:
# attribute quoting options
quote_attr_values = "legacy" # be secure by default
@@ -222,14 +219,14 @@ def __init__(self, **kwargs):
self.strict = False
def encode(self, string):
- assert isinstance(string, text_type)
+ assert isinstance(string, str)
if self.encoding:
return string.encode(self.encoding, "htmlentityreplace")
else:
return string
def encodeStrict(self, string):
- assert isinstance(string, text_type)
+ assert isinstance(string, str)
if self.encoding:
return string.encode(self.encoding, "strict")
else:
diff --git a/html5lib/tests/__init__.py b/html5lib/tests/__init__.py
index b8ce2de3..e69de29b 100644
--- a/html5lib/tests/__init__.py
+++ b/html5lib/tests/__init__.py
@@ -1 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
diff --git a/html5lib/tests/conftest.py b/html5lib/tests/conftest.py
index fffeb50c..de9b1572 100644
--- a/html5lib/tests/conftest.py
+++ b/html5lib/tests/conftest.py
@@ -1,4 +1,3 @@
-from __future__ import print_function
import os.path
import sys
@@ -54,7 +53,7 @@ def pytest_configure(config):
# Check for optional requirements
req_file = os.path.join(_root, "requirements-optional.txt")
if os.path.exists(req_file):
- with open(req_file, "r") as fp:
+ with open(req_file) as fp:
for line in fp:
if (line.strip() and
not (line.startswith("-r") or
@@ -79,7 +78,7 @@ def pytest_configure(config):
import xml.etree.ElementTree as ElementTree
try:
- import xml.etree.cElementTree as cElementTree
+ import xml.etree.ElementTree as cElementTree
except ImportError:
msgs.append("cElementTree unable to be imported")
else:
diff --git a/html5lib/tests/sanitizer.py b/html5lib/tests/sanitizer.py
index 16e53868..93ad4f52 100644
--- a/html5lib/tests/sanitizer.py
+++ b/html5lib/tests/sanitizer.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import codecs
import json
diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
index 1bd0ccc1..3a6f37c2 100644
--- a/html5lib/tests/support.py
+++ b/html5lib/tests/support.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
# pylint:disable=wrong-import-position
@@ -86,7 +85,7 @@ def __getitem__(self, key):
return dict.get(self, key, self.default)
-class TestData(object):
+class TestData:
def __init__(self, filename, newTestHeading="data", encoding="utf8"):
if encoding is None:
self.f = open(filename, mode="rb")
diff --git a/html5lib/tests/test_alphabeticalattributes.py b/html5lib/tests/test_alphabeticalattributes.py
index 7d5b8e0f..87beb8f1 100644
--- a/html5lib/tests/test_alphabeticalattributes.py
+++ b/html5lib/tests/test_alphabeticalattributes.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from collections import OrderedDict
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index 47c4814a..10b666da 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import os
@@ -9,7 +8,7 @@
def test_basic_prescan_length():
- data = "Caf\u00E9".encode('utf-8')
+ data = "Caf\u00E9".encode()
pad = 1024 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 1024 # Sanity
@@ -18,7 +17,7 @@ def test_basic_prescan_length():
def test_parser_reparse():
- data = "Caf\u00E9".encode('utf-8')
+ data = "Caf\u00E9".encode()
pad = 10240 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 10240 # Sanity
diff --git a/html5lib/tests/test_meta.py b/html5lib/tests/test_meta.py
index e02268aa..2fc6140d 100644
--- a/html5lib/tests/test_meta.py
+++ b/html5lib/tests/test_meta.py
@@ -1,10 +1,4 @@
-from __future__ import absolute_import, division, unicode_literals
-
-import six
-try:
- from unittest.mock import Mock
-except ImportError:
- from mock import Mock
+from unittest.mock import Mock
from . import support
@@ -30,11 +24,7 @@ def test_errorMessage():
r = support.errorMessage(input, expected, actual)
# Assertions!
- if six.PY2:
- assert b"Input:\n1\nExpected:\n2\nReceived\n3\n" == r
- else:
- assert six.PY3
- assert "Input:\n1\nExpected:\n2\nReceived\n3\n" == r
+ assert "Input:\n1\nExpected:\n2\nReceived\n3\n" == r
assert input.__repr__.call_count == 1
assert expected.__repr__.call_count == 1
diff --git a/html5lib/tests/test_optionaltags_filter.py b/html5lib/tests/test_optionaltags_filter.py
index cd282149..180a109e 100644
--- a/html5lib/tests/test_optionaltags_filter.py
+++ b/html5lib/tests/test_optionaltags_filter.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from html5lib.filters.optionaltags import Filter
diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
index 6b464bea..da76cd41 100644
--- a/html5lib/tests/test_parser2.py
+++ b/html5lib/tests/test_parser2.py
@@ -1,7 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from six import PY2, text_type
-
import io
from . import support # noqa
@@ -74,11 +70,6 @@ def test_debug_log():
('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]
- if PY2:
- for i, log in enumerate(expected):
- log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
- expected[i] = tuple(log)
-
assert parser.log == expected
diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
index 499310b6..562ee7fa 100644
--- a/html5lib/tests/test_sanitizer.py
+++ b/html5lib/tests/test_sanitizer.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import warnings
diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py
index a2be0be5..5c225790 100644
--- a/html5lib/tests/test_serializer.py
+++ b/html5lib/tests/test_serializer.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import os
import json
diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
index efe9b472..0512419c 100644
--- a/html5lib/tests/test_stream.py
+++ b/html5lib/tests/test_stream.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from . import support # noqa
@@ -8,8 +7,8 @@
import pytest
-import six
-from six.moves import http_client, urllib
+import http.client
+import urllib.response
from html5lib._inputstream import (BufferedStream, HTMLInputStream,
HTMLUnicodeInputStream, HTMLBinaryInputStream)
@@ -105,7 +104,7 @@ def test_char_ascii():
def test_char_utf8():
- stream = HTMLInputStream('\u2018'.encode('utf-8'), override_encoding='utf-8')
+ stream = HTMLInputStream('\u2018'.encode(), override_encoding='utf-8')
assert stream.charEncoding[0].name == 'utf-8'
assert stream.char() == '\u2018'
@@ -186,12 +185,12 @@ def test_python_issue_20007():
Make sure we have a work-around for Python bug #20007
http://bugs.python.org/issue20007
"""
- class FakeSocket(object):
+ class FakeSocket:
def makefile(self, _mode, _bufsize=None):
# pylint:disable=unused-argument
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
- source = http_client.HTTPResponse(FakeSocket())
+ source = http.client.HTTPResponse(FakeSocket())
source.begin()
stream = HTMLInputStream(source)
assert stream.charsUntil(" ") == "Text"
@@ -202,15 +201,12 @@ def test_python_issue_20007_b():
Make sure we have a work-around for Python bug #20007
http://bugs.python.org/issue20007
"""
- if six.PY2:
- return
-
- class FakeSocket(object):
+ class FakeSocket:
def makefile(self, _mode, _bufsize=None):
# pylint:disable=unused-argument
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
- source = http_client.HTTPResponse(FakeSocket())
+ source = http.client.HTTPResponse(FakeSocket())
source.begin()
wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
stream = HTMLInputStream(wrapped)
diff --git a/html5lib/tests/test_tokenizer2.py b/html5lib/tests/test_tokenizer2.py
index 158d847a..4e993571 100644
--- a/html5lib/tests/test_tokenizer2.py
+++ b/html5lib/tests/test_tokenizer2.py
@@ -1,9 +1,6 @@
-from __future__ import absolute_import, division, unicode_literals
import io
-from six import unichr, text_type
-
from html5lib._tokenizer import HTMLTokenizer
from html5lib.constants import tokenTypes
@@ -16,7 +13,7 @@ def ignore_parse_errors(toks):
def test_maintain_attribute_order():
# generate loads to maximize the chance a hash-based mutation will occur
- attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+ attrs = [(chr(x), str(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
stream = io.StringIO("")
toks = HTMLTokenizer(stream)
@@ -49,7 +46,7 @@ def test_duplicate_attribute():
def test_maintain_duplicate_attribute_order():
# generate loads to maximize the chance a hash-based mutation will occur
- attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+ attrs = [(chr(x), str(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
stream = io.StringIO("")
toks = HTMLTokenizer(stream)
diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py
index 95e56c00..3af383c3 100644
--- a/html5lib/tests/test_treeadapters.py
+++ b/html5lib/tests/test_treeadapters.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from . import support # noqa
diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 780ca964..22ee0cb7 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -1,9 +1,7 @@
-from __future__ import absolute_import, division, unicode_literals
import itertools
import sys
-from six import unichr, text_type
import pytest
try:
@@ -74,11 +72,11 @@ def param_treewalker_six_mix():
# fragment but not using the u'' syntax nor importing unicode_literals
sm_tests = [
('Example',
- [(str('class'), str('test123'))],
+ [('class', 'test123')],
'\n class="test123"\n href="http://example.com"\n "Example"'),
('',
- [(str('rel'), str('alternate'))],
+ [('rel', 'alternate')],
'\n href="http://example.com/cow"\n rel="alternate"\n "Example"')
]
@@ -151,7 +149,7 @@ def test_maintain_attribute_order(treeName):
pytest.skip("Treebuilder not loaded")
# generate loads to maximize the chance a hash-based mutation will occur
- attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+ attrs = [(chr(x), str(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
data = ""
parser = html5parser.HTMLParser(tree=treeAPIs["builder"])
diff --git a/html5lib/tests/test_whitespace_filter.py b/html5lib/tests/test_whitespace_filter.py
index e9da6140..d4e4e3be 100644
--- a/html5lib/tests/test_whitespace_filter.py
+++ b/html5lib/tests/test_whitespace_filter.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from html5lib.filters.whitespace import Filter
from html5lib.constants import spaceCharacters
diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py
index b49d2e6e..d2605a12 100644
--- a/html5lib/tests/tokenizer.py
+++ b/html5lib/tests/tokenizer.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import codecs
import json
@@ -6,13 +5,12 @@
import re
import pytest
-from six import unichr
from html5lib._tokenizer import HTMLTokenizer
from html5lib import constants, _utils
-class TokenizerTestParser(object):
+class TokenizerTestParser:
def __init__(self, initialState, lastStartTag=None):
self.tokenizer = HTMLTokenizer
self._state = initialState
@@ -146,15 +144,15 @@ def repl(m):
low = int(m.group(2), 16)
if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
- return unichr(cp)
+ return chr(cp)
else:
- return unichr(high) + unichr(low)
+ return chr(high) + chr(low)
else:
- return unichr(int(m.group(1), 16))
+ return chr(int(m.group(1), 16))
try:
return _surrogateRe.sub(repl, inp)
except ValueError:
- # This occurs when unichr throws ValueError, which should
+ # This occurs when chr throws ValueError, which should
# only be for a lone-surrogate.
if _utils.supports_lone_surrogates:
raise
diff --git a/html5lib/tests/tokenizertotree.py b/html5lib/tests/tokenizertotree.py
index 42463f32..6c0b4f77 100644
--- a/html5lib/tests/tokenizertotree.py
+++ b/html5lib/tests/tokenizertotree.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import sys
import os
@@ -25,7 +24,7 @@ def main(out_path):
def run_file(filename, out_path):
try:
- tests_data = json.load(open(filename, "r"))
+ tests_data = json.load(open(filename))
except ValueError:
sys.stderr.write("Failed to load %s\n" % filename)
return
diff --git a/html5lib/tests/tree_construction.py b/html5lib/tests/tree_construction.py
index 363b48c2..e2381754 100644
--- a/html5lib/tests/tree_construction.py
+++ b/html5lib/tests/tree_construction.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
import itertools
import re
diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py
index dfeb0ba5..1444fc9a 100644
--- a/html5lib/treeadapters/__init__.py
+++ b/html5lib/treeadapters/__init__.py
@@ -16,7 +16,6 @@
genshi_tree = genshi.to_genshi(TreeWalker(tree))
"""
-from __future__ import absolute_import, division, unicode_literals
from . import sax
diff --git a/html5lib/treeadapters/genshi.py b/html5lib/treeadapters/genshi.py
index 61d5fb6a..b0b29ed3 100644
--- a/html5lib/treeadapters/genshi.py
+++ b/html5lib/treeadapters/genshi.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName, Attrs
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
diff --git a/html5lib/treeadapters/sax.py b/html5lib/treeadapters/sax.py
index f4ccea5a..ead1a5c4 100644
--- a/html5lib/treeadapters/sax.py
+++ b/html5lib/treeadapters/sax.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from xml.sax.xmlreader import AttributesNSImpl
diff --git a/html5lib/treebuilders/__init__.py b/html5lib/treebuilders/__init__.py
index d44447ea..90aad5fb 100644
--- a/html5lib/treebuilders/__init__.py
+++ b/html5lib/treebuilders/__init__.py
@@ -29,7 +29,6 @@
"""
-from __future__ import absolute_import, division, unicode_literals
from .._utils import default_etree
diff --git a/html5lib/treebuilders/base.py b/html5lib/treebuilders/base.py
index 020d7e15..3fec12c4 100644
--- a/html5lib/treebuilders/base.py
+++ b/html5lib/treebuilders/base.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-
from ..constants import scopingElements, tableInsertModeElements, namespaces
# The scope markers are inserted when entering object elements,
@@ -20,7 +17,7 @@
}
-class Node(object):
+class Node:
"""Represents an item in the tree"""
def __init__(self, name):
"""Creates a Node
@@ -144,7 +141,7 @@ def nodesEqual(self, node1, node2):
return True
-class TreeBuilder(object):
+class TreeBuilder:
"""Base treebuilder implementation
* documentClass - the class to use for the bottommost node of a document
@@ -200,7 +197,7 @@ def elementInScope(self, target, variant=None):
# match any node with that name
exactNode = hasattr(target, "nameTuple")
if not exactNode:
- if isinstance(target, text_type):
+ if isinstance(target, str):
target = (namespaces["html"], target)
assert isinstance(target, tuple)
@@ -323,7 +320,7 @@ def _setInsertFromTable(self, value):
def insertElementNormal(self, token):
name = token["name"]
- assert isinstance(name, text_type), "Element %s not unicode" % name
+ assert isinstance(name, str), "Element %s not unicode" % name
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py
index d8b53004..bc56c708 100644
--- a/html5lib/treebuilders/dom.py
+++ b/html5lib/treebuilders/dom.py
@@ -1,10 +1,6 @@
-from __future__ import absolute_import, division, unicode_literals
-try:
- from collections.abc import MutableMapping
-except ImportError: # Python 2.7
- from collections import MutableMapping
+from collections.abc import MutableMapping
from xml.dom import minidom, Node
import weakref
diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py
index 0b745081..f9564fe0 100644
--- a/html5lib/treebuilders/etree.py
+++ b/html5lib/treebuilders/etree.py
@@ -1,8 +1,5 @@
-from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
-from six import text_type
-
import re
from copy import copy
@@ -222,7 +219,7 @@ def serializeElement(element, indent=0):
elif element.tag == ElementTreeCommentType:
rv.append("|%s" % (' ' * indent, element.text))
else:
- assert isinstance(element.tag, text_type), \
+ assert isinstance(element.tag, str), \
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
nsmatch = tag_regexp.match(element.tag)
diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
index e73de61a..b0be4617 100644
--- a/html5lib/treebuilders/etree_lxml.py
+++ b/html5lib/treebuilders/etree_lxml.py
@@ -9,17 +9,13 @@
When any of these things occur, we emit a DataLossWarning
"""
-from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings
import re
import sys
-try:
- from collections.abc import MutableMapping
-except ImportError:
- from collections import MutableMapping
+from collections.abc import MutableMapping
from . import base
from ..constants import DataLossWarning
@@ -28,7 +24,6 @@
from .. import _ihatexml
import lxml.etree as etree
-from six import PY3, binary_type
fullTree = True
@@ -37,14 +32,14 @@
comment_type = etree.Comment("asd").tag
-class DocumentType(object):
+class DocumentType:
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
-class Document(object):
+class Document:
def __init__(self):
self._elementTree = None
self._childNodes = []
@@ -208,8 +203,6 @@ def _coerceKey(self, key):
def __getitem__(self, key):
value = self._element._element.attrib[self._coerceKey(key)]
- if not PY3 and isinstance(value, binary_type):
- value = value.decode("ascii")
return value
def __setitem__(self, key, value):
diff --git a/html5lib/treewalkers/__init__.py b/html5lib/treewalkers/__init__.py
index b2d3aac3..b78d6f46 100644
--- a/html5lib/treewalkers/__init__.py
+++ b/html5lib/treewalkers/__init__.py
@@ -8,7 +8,6 @@
returns an iterator which generates tokens.
"""
-from __future__ import absolute_import, division, unicode_literals
from .. import constants
from .._utils import default_etree
diff --git a/html5lib/treewalkers/base.py b/html5lib/treewalkers/base.py
index 80c474c4..7ee75d81 100644
--- a/html5lib/treewalkers/base.py
+++ b/html5lib/treewalkers/base.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters
@@ -17,7 +16,7 @@
spaceCharacters = "".join(spaceCharacters)
-class TreeWalker(object):
+class TreeWalker:
"""Walks a tree yielding tokens
Tokens are dicts that all have a ``type`` field specifying the type of the
diff --git a/html5lib/treewalkers/dom.py b/html5lib/treewalkers/dom.py
index b0c89b00..85e12505 100644
--- a/html5lib/treewalkers/dom.py
+++ b/html5lib/treewalkers/dom.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py
index 411a1d45..41607f52 100644
--- a/html5lib/treewalkers/etree.py
+++ b/html5lib/treewalkers/etree.py
@@ -1,10 +1,7 @@
-from __future__ import absolute_import, division, unicode_literals
from collections import OrderedDict
import re
-from six import string_types
-
from . import base
from .._utils import moduleFactoryFactory
@@ -51,7 +48,7 @@ def getNodeDetails(self, node):
return base.COMMENT, node.text
else:
- assert isinstance(node.tag, string_types), type(node.tag)
+ assert isinstance(node.tag, str), type(node.tag)
# This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
diff --git a/html5lib/treewalkers/etree_lxml.py b/html5lib/treewalkers/etree_lxml.py
index a614ac5b..0ec633ac 100644
--- a/html5lib/treewalkers/etree_lxml.py
+++ b/html5lib/treewalkers/etree_lxml.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-
from collections import OrderedDict
from lxml import etree
@@ -14,13 +11,13 @@
def ensure_str(s):
if s is None:
return None
- elif isinstance(s, text_type):
+ elif isinstance(s, str):
return s
else:
return s.decode("ascii", "strict")
-class Root(object):
+class Root:
def __init__(self, et):
self.elementtree = et
self.children = []
@@ -58,7 +55,7 @@ def __len__(self):
return 1
-class Doctype(object):
+class Doctype:
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
@@ -81,7 +78,7 @@ def getnext(self):
return None
-class FragmentWrapper(object):
+class FragmentWrapper:
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
diff --git a/html5lib/treewalkers/genshi.py b/html5lib/treewalkers/genshi.py
index 7483be27..78f22fd3 100644
--- a/html5lib/treewalkers/genshi.py
+++ b/html5lib/treewalkers/genshi.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
diff --git a/parse.py b/parse.py
index e6806b46..14bbe99a 100755
--- a/parse.py
+++ b/parse.py
@@ -42,7 +42,7 @@ def parse():
try:
# Try opening from file system
f = open(f, "rb")
- except IOError as e:
+ except OSError as e:
sys.stderr.write("Unable to open file: %s\n" % e)
sys.exit(1)
except IndexError:
diff --git a/requirements-oldest.txt b/requirements-oldest.txt
index 68d0f13d..62d203ae 100644
--- a/requirements-oldest.txt
+++ b/requirements-oldest.txt
@@ -1,8 +1,7 @@
# This allows us to install the actually oldest supported dependencies and test whether that works.
# requirements.txt
-six==1.9
-webencodings==0.5.1
+# (nothing)
# requirements-optional.txt
genshi==0.7.1 ; python_version < '3.8'
@@ -26,4 +25,4 @@ pytest==5.4.2 ; python_version >= '3'
coverage==5.1
pytest-expect==1.1.0
mock==3.0.5 ; python_version < '3.6'
-mock==4.0.2 ; python_version >= '3.6'
\ No newline at end of file
+mock==4.0.2 ; python_version >= '3.6'
diff --git a/requirements-test.txt b/requirements-test.txt
index aca31f5e..1415d163 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -6,5 +6,6 @@ pytest>=4.6.10,<5 ; python_version < '3'
pytest>=5.4.2,<8 ; python_version >= '3'
coverage>=5.1,<6
pytest-expect>=1.1.0,<2
+six>=1.9 # required by pytest-expect
mock>=3.0.5,<4 ; python_version < '3.3'
setuptools; python_version >= '3.12'
diff --git a/requirements.txt b/requirements.txt
index ae7ec3d0..e69de29b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +0,0 @@
-six>=1.9
-webencodings
diff --git a/setup.py b/setup.py
index 30ee0575..638997c8 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,3 @@
-from __future__ import print_function
import ast
import codecs
@@ -64,11 +63,7 @@ def default_environment():
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
- 'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.5',
- 'Programming Language :: Python :: 3.6',
- 'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
@@ -106,11 +101,8 @@ def default_environment():
maintainer='James Graham',
maintainer_email='james@hoppipolla.co.uk',
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
- install_requires=[
- 'six>=1.9',
- 'webencodings>=0.5.1',
- ],
- python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*",
+ install_requires=[],
+ python_requires=">=3.8",
extras_require={
# A conditional extra will only install these items when the extra is
# requested and the condition matches.
diff --git a/tox.ini b/tox.ini
index fb228e96..94a78542 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist = py{27,35,36,37,38,39,310,311,py,py3}-{base,optional,oldest}
+envlist = py{38,39,310,311,py,py3}-{base,optional,oldest}
[testenv]
deps =
diff --git a/toxver.py b/toxver.py
index 68eb71ec..950dc083 100755
--- a/toxver.py
+++ b/toxver.py
@@ -12,18 +12,11 @@
$ toxver.py pypy-3.8 base
TOXENV=pypy3-base
- $ toxver.py 2.7 oldest
- TOXENV=py27-oldest
-
$ toxver.py ~3.12.0-0 optional
TOXENV=py312-optional
"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
import sys
@@ -35,10 +28,6 @@ def main(argv):
deps = argv[2]
- if argv[1].startswith("pypy-2"):
- print("TOXENV=pypy-" + deps)
- return 0
-
if argv[1].startswith("pypy-3"):
print("TOXENV=pypy3-" + deps)
return 0