diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index e6f7bf48..00000000 --- a/.appveyor.yml +++ /dev/null @@ -1,29 +0,0 @@ -# appveyor.yml - https://www.appveyor.com/docs/lang/python -# https://www.appveyor.com/docs/windows-images-software/#visual-studio-2022 ---- -image: Visual Studio 2022 -environment: - matrix: - - PY_PYTHON: 2.7 - TOXENV: py27-base - - PY_PYTHON: 2.7 - TOXENV: py27-optional - - PY_PYTHON: 3.7 - TOXENV: py37-base - - PY_PYTHON: 3.7 - TOXENV: py37-optional - -install: - - git submodule update --init --recursive - - py --list - - py -VV - - py -m pip install --upgrade pip - - py -m pip install tox - -build: off - -test_script: - - py -m tox - -after_test: - - py debug-info.py diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml index 5ed83175..0912abb3 100644 --- a/.github/workflows/python-tox.yml +++ b/.github/workflows/python-tox.yml @@ -12,9 +12,6 @@ jobs: os: [ubuntu-latest, windows-latest] deps: [base, optional] include: - - python: "pypy-2.7" - os: ubuntu-latest - deps: base - python: "pypy-3.10" os: ubuntu-latest deps: base diff --git a/README.rst b/README.rst index 6a623a43..befc7aaa 100644 --- a/README.rst +++ b/README.rst @@ -29,7 +29,7 @@ or: By default, the ``document`` will be an ``xml.etree`` element instance. Whenever possible, html5lib chooses the accelerated ``ElementTree`` -implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x). +implementation. Two other tree types are supported: ``xml.dom.minidom`` and ``lxml.etree``. To use an alternative format, specify the name of @@ -41,18 +41,6 @@ a treebuilder: with open("mydocument.html", "rb") as f: lxml_etree_document = html5lib.parse(f, treebuilder="lxml") -When using with ``urllib2`` (Python 2), the charset from HTTP should be -pass into html5lib as follows: - -.. 
code-block:: python - - from contextlib import closing - from urllib2 import urlopen - import html5lib - - with closing(urlopen("http://example.com/")) as f: - document = html5lib.parse(f, transport_encoding=f.info().getparam("charset")) - When using with ``urllib.request`` (Python 3), the charset from HTTP should be pass into html5lib as follows: @@ -90,7 +78,7 @@ More documentation is available at https://html5lib.readthedocs.io/. Installation ------------ -html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install: +html5lib works on CPython 3.8+ and PyPy. To install: .. code-block:: bash diff --git a/debug-info.py b/debug-info.py index b47b8ebf..5523067c 100644 --- a/debug-info.py +++ b/debug-info.py @@ -1,4 +1,3 @@ -from __future__ import print_function, unicode_literals import platform import sys @@ -12,7 +11,7 @@ "maxsize": sys.maxsize } -search_modules = ["chardet", "genshi", "html5lib", "lxml", "six"] +search_modules = ["chardet", "genshi", "html5lib", "lxml"] found_modules = [] for m in search_modules: diff --git a/doc/conf.py b/doc/conf.py index d5a1e863..66defcce 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # html5lib documentation build configuration file, created by # sphinx-quickstart on Wed May 8 00:04:49 2013. 
@@ -100,7 +99,7 @@ } -class CExtMock(object): +class CExtMock: """Required for autodoc on readthedocs.org where you cannot build C extensions.""" def __init__(self, *args, **kwargs): pass diff --git a/html5lib/__init__.py b/html5lib/__init__.py index 7b854f99..d2c68855 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -20,7 +20,6 @@ * :func:`~.serializer.serialize` """ -from __future__ import absolute_import, division, unicode_literals from .html5parser import HTMLParser, parse, parseFragment from .treebuilders import getTreeBuilder diff --git a/html5lib/_ihatexml.py b/html5lib/_ihatexml.py index d725eabd..f5b6e1f4 100644 --- a/html5lib/_ihatexml.py +++ b/html5lib/_ihatexml.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import re import warnings @@ -181,7 +180,7 @@ def escapeRegexp(string): nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]") -class InfosetFilter(object): +class InfosetFilter: replacementRegexp = re.compile(r"U[\dA-F]{5,5}") def __init__(self, diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py index a93b5a4e..abbc3d7a 100644 --- a/html5lib/_inputstream.py +++ b/html5lib/_inputstream.py @@ -1,13 +1,12 @@ -from __future__ import absolute_import, division, unicode_literals -from six import text_type -from six.moves import http_client, urllib +import http.client +import urllib.response import codecs import re from io import BytesIO, StringIO -import webencodings +from .contrib import webencodings from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase from .constants import _ReparseException @@ -48,7 +47,7 @@ charsUntilRegEx = {} -class BufferedStream(object): +class BufferedStream: """Buffering for streams that do not have buffering of their own The buffer is implemented as a list of chunks on the assumption that @@ -125,15 +124,15 @@ def _readFromBuffer(self, bytes): def HTMLInputStream(source, **kwargs): # Work around Python bug #20007: 
read(0) closes the connection. # http://bugs.python.org/issue20007 - if (isinstance(source, http_client.HTTPResponse) or + if (isinstance(source, http.client.HTTPResponse) or # Also check for addinfourl wrapping HTTPResponse (isinstance(source, urllib.response.addbase) and - isinstance(source.fp, http_client.HTTPResponse))): + isinstance(source.fp, http.client.HTTPResponse))): isUnicode = False elif hasattr(source, "read"): - isUnicode = isinstance(source.read(0), text_type) + isUnicode = isinstance(source.read(0), str) else: - isUnicode = isinstance(source, text_type) + isUnicode = isinstance(source, str) if isUnicode: encodings = [x for x in kwargs if x.endswith("_encoding")] @@ -145,7 +144,7 @@ def HTMLInputStream(source, **kwargs): return HTMLBinaryInputStream(source, **kwargs) -class HTMLUnicodeInputStream(object): +class HTMLUnicodeInputStream: """Provides a unicode stream of characters to the HTMLTokenizer. This class takes care of character encoding and removing or replacing @@ -673,7 +672,7 @@ def jumpTo(self, bytes): return True -class EncodingParser(object): +class EncodingParser: """Mini parser for detecting character encoding from meta elements""" def __init__(self, data): @@ -861,7 +860,7 @@ def getAttribute(self): attrValue.append(c) -class ContentAttrParser(object): +class ContentAttrParser: def __init__(self, data): assert isinstance(data, bytes) self.data = data diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index 4748a197..75dab441 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -1,6 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals - -from six import unichr as chr from collections import deque, OrderedDict from sys import version_info @@ -24,7 +21,7 @@ attributeMap = OrderedDict -class HTMLTokenizer(object): +class HTMLTokenizer: """ This class takes care of tokenizing HTML. 
* self.currentToken diff --git a/html5lib/_trie/__init__.py b/html5lib/_trie/__init__.py index 07bad5d3..df8912a0 100644 --- a/html5lib/_trie/__init__.py +++ b/html5lib/_trie/__init__.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from .py import Trie diff --git a/html5lib/_trie/_base.py b/html5lib/_trie/_base.py index 6b71975f..63927ee4 100644 --- a/html5lib/_trie/_base.py +++ b/html5lib/_trie/_base.py @@ -1,9 +1,5 @@ -from __future__ import absolute_import, division, unicode_literals -try: - from collections.abc import Mapping -except ImportError: # Python 2.7 - from collections import Mapping +from collections.abc import Mapping class Trie(Mapping): diff --git a/html5lib/_trie/py.py b/html5lib/_trie/py.py index c2ba3da7..bc6363c4 100644 --- a/html5lib/_trie/py.py +++ b/html5lib/_trie/py.py @@ -1,6 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals -from six import text_type - from bisect import bisect_left from ._base import Trie as ABCTrie @@ -8,7 +5,7 @@ class Trie(ABCTrie): def __init__(self, data): - if not all(isinstance(x, text_type) for x in data.keys()): + if not all(isinstance(x, str) for x in data.keys()): raise TypeError("All keys must be strings") self._data = data diff --git a/html5lib/_utils.py b/html5lib/_utils.py index 7e23ee57..5853e81d 100644 --- a/html5lib/_utils.py +++ b/html5lib/_utils.py @@ -1,21 +1,9 @@ -from __future__ import absolute_import, division, unicode_literals from types import ModuleType -try: - from collections.abc import Mapping -except ImportError: - from collections import Mapping - -from six import text_type, PY3 +from collections.abc import Mapping -if PY3: - import xml.etree.ElementTree as default_etree -else: - try: - import xml.etree.cElementTree as default_etree - except ImportError: - import xml.etree.ElementTree as default_etree +import xml.etree.ElementTree as default_etree __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", @@ -31,10 
+19,10 @@ # escapes. try: _x = eval('"\\uD800"') # pylint:disable=eval-used - if not isinstance(_x, text_type): + if not isinstance(_x, str): # We need this with u"" because of http://bugs.jython.org/issue2039 _x = eval('u"\\uD800"') # pylint:disable=eval-used - assert isinstance(_x, text_type) + assert isinstance(_x, str) except Exception: supports_lone_surrogates = False else: @@ -122,7 +110,7 @@ def moduleFactoryFactory(factory): moduleCache = {} def moduleFactory(baseModule, *args, **kwargs): - if isinstance(ModuleType.__name__, type("")): + if isinstance(ModuleType.__name__, str): name = "_%s_factory" % baseModule.__name__ else: name = b"_%s_factory" % baseModule.__name__ diff --git a/html5lib/constants.py b/html5lib/constants.py index 2fa4146d..a4b1efa1 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import string diff --git a/html5lib/contrib/__init__.py b/html5lib/contrib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/html5lib/contrib/webencodings/LiCENSE b/html5lib/contrib/webencodings/LiCENSE new file mode 100644 index 00000000..3d0d3e70 --- /dev/null +++ b/html5lib/contrib/webencodings/LiCENSE @@ -0,0 +1,31 @@ +Copyright (c) 2012 by Simon Sapin. + +Some rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * The names of the contributors may not be used to endorse or + promote products derived from this software without specific + prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/html5lib/contrib/webencodings/__init__.py b/html5lib/contrib/webencodings/__init__.py new file mode 100644 index 00000000..2d7bf148 --- /dev/null +++ b/html5lib/contrib/webencodings/__init__.py @@ -0,0 +1,340 @@ +# coding: utf-8 +""" + + webencodings + ~~~~~~~~~~~~ + + This is a Python implementation of the `WHATWG Encoding standard + `. See README for details. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +import codecs + +from .labels import LABELS + + +VERSION = '0.6-dev' + + +# Some names in Encoding are not valid Python aliases. Remap these. +PYTHON_NAMES = { + 'iso-8859-8-i': 'iso-8859-8', + 'x-mac-cyrillic': 'mac-cyrillic', + 'macintosh': 'mac-roman', + 'windows-874': 'cp874'} + +CACHE = {} + + +def ascii_lower(string): + r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z. + + :param string: A Unicode string. + :returns: A new Unicode string. + + This is used for `ASCII case-insensitive + `_ + matching of encoding labels. + The same matching is also used, among other things, + for `CSS keywords `_. 
+ + This is different from the :meth:`~py:str.lower` method of Unicode strings + which also affects non-ASCII characters, + sometimes mapping them into the ASCII range: + + >>> keyword = u'Bac\N{KELVIN SIGN}ground' + >>> assert keyword.lower() == u'background' + >>> assert ascii_lower(keyword) != keyword.lower() + >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground' + + """ + # This turns out to be faster than unicode.translate() + return string.encode('utf8').lower().decode('utf8') + + +def lookup(label): + """ + Look for an encoding by its label. + This is the spec’s `get an encoding + `_ algorithm. + Supported labels are listed there. + + :param label: A string. + :returns: + An :class:`Encoding` object, or :obj:`None` for an unknown label. + + """ + # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020. + label = ascii_lower(label.strip('\t\n\f\r ')) + name = LABELS.get(label) + if name is None: + return None + encoding = CACHE.get(name) + if encoding is None: + if name == 'x-user-defined': + from .x_user_defined import codec_info + else: + python_name = PYTHON_NAMES.get(name, name) + # Any python_name value that gets to here should be valid. + codec_info = codecs.lookup(python_name) + encoding = Encoding(name, codec_info) + CACHE[name] = encoding + return encoding + + +def _get_encoding(encoding_or_label): + """ + Accept either an encoding object or label. + + :param encoding_or_label: An :class:`Encoding` object or a label string. + :returns: An :class:`Encoding` object. + :raises: :exc:`~exceptions.LookupError` for an unknown label. + + """ + if hasattr(encoding_or_label, 'codec_info'): + return encoding_or_label + + encoding = lookup(encoding_or_label) + if encoding is None: + raise LookupError('Unknown encoding label: %r' % encoding_or_label) + return encoding + + +class Encoding(object): + """Represents a character encoding such as UTF-8, + that can be used for decoding or encoding. + + .. 
attribute:: name + + Canonical name of the encoding + + .. attribute:: codec_info + + The actual implementation of the encoding, + a stdlib :class:`~codecs.CodecInfo` object. + See :func:`codecs.register`. + + """ + def __init__(self, name, codec_info): + self.name = name + self.codec_info = codec_info + + def __repr__(self): + return '<Encoding %s>' % self.name + + +#: The UTF-8 encoding. Should be used for new content and formats. +UTF8 = lookup('utf-8') + +_UTF16LE = lookup('utf-16le') +_UTF16BE = lookup('utf-16be') + + +def decode(input, fallback_encoding, errors='replace'): + """ + Decode a single string. + + :param input: A byte string + :param fallback_encoding: + An :class:`Encoding` object or a label string. + The encoding to use if :obj:`input` does not have a BOM. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :return: + A ``(output, encoding)`` tuple of a Unicode string + and an :obj:`Encoding`. + + """ + # Fail early if `fallback_encoding` is an invalid label. + fallback_encoding = _get_encoding(fallback_encoding) + bom_encoding, input = _detect_bom(input) + encoding = bom_encoding or fallback_encoding + return encoding.codec_info.decode(input, errors)[0], encoding + + +def _detect_bom(input): + """Return (bom_encoding, input), with any BOM removed from the input.""" + if input.startswith(b'\xFF\xFE'): + return _UTF16LE, input[2:] + if input.startswith(b'\xFE\xFF'): + return _UTF16BE, input[2:] + if input.startswith(b'\xEF\xBB\xBF'): + return UTF8, input[3:] + return None, input + + +def encode(input, encoding=UTF8, errors='strict'): + """ + Encode a single string. + + :param input: A Unicode string. + :param encoding: An :class:`Encoding` object or a label string. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :return: A byte string. 
+ + """ + return _get_encoding(encoding).codec_info.encode(input, errors)[0] + + +def iter_decode(input, fallback_encoding, errors='replace'): + """ + "Pull"-based decoder. + + :param input: + An iterable of byte strings. + + The input is first consumed just enough to determine the encoding + based on the presence of a BOM, + then consumed on demand when the return value is. + :param fallback_encoding: + An :class:`Encoding` object or a label string. + The encoding to use if :obj:`input` does not have a BOM. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :returns: + An ``(output, encoding)`` tuple. + :obj:`output` is an iterable of Unicode strings, + :obj:`encoding` is the :obj:`Encoding` that is being used. + + """ + + decoder = IncrementalDecoder(fallback_encoding, errors) + generator = _iter_decode_generator(input, decoder) + encoding = next(generator) + return generator, encoding + + +def _iter_decode_generator(input, decoder): + """Return a generator that first yields the :obj:`Encoding`, + then yields output chunks as Unicode strings. + + """ + decode = decoder.decode + input = iter(input) + for chunck in input: + output = decode(chunck) + if output: + assert decoder.encoding is not None + yield decoder.encoding + yield output + break + else: + # Input exhausted without determining the encoding + output = decode(b'', final=True) + assert decoder.encoding is not None + yield decoder.encoding + if output: + yield output + return + + for chunck in input: + output = decode(chunck) + if output: + yield output + output = decode(b'', final=True) + if output: + yield output + + +def iter_encode(input, encoding=UTF8, errors='strict'): + """ + “Pull”-based encoder. + + :param input: An iterable of Unicode strings. + :param encoding: An :class:`Encoding` object or a label string. + :param errors: Type of error handling. See :func:`codecs.register`. 
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :returns: An iterable of byte strings. + + """ + # Fail early if `encoding` is an invalid label. + encode = IncrementalEncoder(encoding, errors).encode + return _iter_encode_generator(input, encode) + + +def _iter_encode_generator(input, encode): + for chunck in input: + output = encode(chunck) + if output: + yield output + output = encode('', final=True) + if output: + yield output + + +class IncrementalDecoder(object): + """ + “Push”-based decoder. + + :param fallback_encoding: + An :class:`Encoding` object or a label string. + The encoding to use if :obj:`input` does not have a BOM. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + + """ + def __init__(self, fallback_encoding, errors='replace'): + # Fail early if `fallback_encoding` is an invalid label. + self._fallback_encoding = _get_encoding(fallback_encoding) + self._errors = errors + self._buffer = b'' + self._decoder = None + #: The actual :class:`Encoding` that is being used, + #: or :obj:`None` if that is not determined yet. + #: (Ie. if there is not enough input yet to determine + #: if there is a BOM.) + self.encoding = None # Not known yet. + + def decode(self, input, final=False): + """Decode one chunk of the input. + + :param input: A byte string. + :param final: + Indicate that no more input is available. + Must be :obj:`True` if this is the last call. + :returns: A Unicode string. + + """ + decoder = self._decoder + if decoder is not None: + return decoder(input, final) + + input = self._buffer + input + encoding, input = _detect_bom(input) + if encoding is None: + if len(input) < 3 and not final: # Not enough data yet. 
+ self._buffer = input + return '' + else: # No BOM + encoding = self._fallback_encoding + decoder = encoding.codec_info.incrementaldecoder(self._errors).decode + self._decoder = decoder + self.encoding = encoding + return decoder(input, final) + + +class IncrementalEncoder(object): + """ + “Push”-based encoder. + + :param encoding: An :class:`Encoding` object or a label string. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + + .. method:: encode(input, final=False) + + :param input: An Unicode string. + :param final: + Indicate that no more input is available. + Must be :obj:`True` if this is the last call. + :returns: A byte string. + + """ + def __init__(self, encoding=UTF8, errors='strict'): + encoding = _get_encoding(encoding) + self.encode = encoding.codec_info.incrementalencoder(errors).encode diff --git a/html5lib/contrib/webencodings/labels.py b/html5lib/contrib/webencodings/labels.py new file mode 100644 index 00000000..29cbf91e --- /dev/null +++ b/html5lib/contrib/webencodings/labels.py @@ -0,0 +1,231 @@ +""" + + webencodings.labels + ~~~~~~~~~~~~~~~~~~~ + + Map encoding labels to their name. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +# XXX Do not edit! 
+# This file is automatically generated by mklabels.py + +LABELS = { + 'unicode-1-1-utf-8': 'utf-8', + 'utf-8': 'utf-8', + 'utf8': 'utf-8', + '866': 'ibm866', + 'cp866': 'ibm866', + 'csibm866': 'ibm866', + 'ibm866': 'ibm866', + 'csisolatin2': 'iso-8859-2', + 'iso-8859-2': 'iso-8859-2', + 'iso-ir-101': 'iso-8859-2', + 'iso8859-2': 'iso-8859-2', + 'iso88592': 'iso-8859-2', + 'iso_8859-2': 'iso-8859-2', + 'iso_8859-2:1987': 'iso-8859-2', + 'l2': 'iso-8859-2', + 'latin2': 'iso-8859-2', + 'csisolatin3': 'iso-8859-3', + 'iso-8859-3': 'iso-8859-3', + 'iso-ir-109': 'iso-8859-3', + 'iso8859-3': 'iso-8859-3', + 'iso88593': 'iso-8859-3', + 'iso_8859-3': 'iso-8859-3', + 'iso_8859-3:1988': 'iso-8859-3', + 'l3': 'iso-8859-3', + 'latin3': 'iso-8859-3', + 'csisolatin4': 'iso-8859-4', + 'iso-8859-4': 'iso-8859-4', + 'iso-ir-110': 'iso-8859-4', + 'iso8859-4': 'iso-8859-4', + 'iso88594': 'iso-8859-4', + 'iso_8859-4': 'iso-8859-4', + 'iso_8859-4:1988': 'iso-8859-4', + 'l4': 'iso-8859-4', + 'latin4': 'iso-8859-4', + 'csisolatincyrillic': 'iso-8859-5', + 'cyrillic': 'iso-8859-5', + 'iso-8859-5': 'iso-8859-5', + 'iso-ir-144': 'iso-8859-5', + 'iso8859-5': 'iso-8859-5', + 'iso88595': 'iso-8859-5', + 'iso_8859-5': 'iso-8859-5', + 'iso_8859-5:1988': 'iso-8859-5', + 'arabic': 'iso-8859-6', + 'asmo-708': 'iso-8859-6', + 'csiso88596e': 'iso-8859-6', + 'csiso88596i': 'iso-8859-6', + 'csisolatinarabic': 'iso-8859-6', + 'ecma-114': 'iso-8859-6', + 'iso-8859-6': 'iso-8859-6', + 'iso-8859-6-e': 'iso-8859-6', + 'iso-8859-6-i': 'iso-8859-6', + 'iso-ir-127': 'iso-8859-6', + 'iso8859-6': 'iso-8859-6', + 'iso88596': 'iso-8859-6', + 'iso_8859-6': 'iso-8859-6', + 'iso_8859-6:1987': 'iso-8859-6', + 'csisolatingreek': 'iso-8859-7', + 'ecma-118': 'iso-8859-7', + 'elot_928': 'iso-8859-7', + 'greek': 'iso-8859-7', + 'greek8': 'iso-8859-7', + 'iso-8859-7': 'iso-8859-7', + 'iso-ir-126': 'iso-8859-7', + 'iso8859-7': 'iso-8859-7', + 'iso88597': 'iso-8859-7', + 'iso_8859-7': 'iso-8859-7', + 'iso_8859-7:1987': 
'iso-8859-7', + 'sun_eu_greek': 'iso-8859-7', + 'csiso88598e': 'iso-8859-8', + 'csisolatinhebrew': 'iso-8859-8', + 'hebrew': 'iso-8859-8', + 'iso-8859-8': 'iso-8859-8', + 'iso-8859-8-e': 'iso-8859-8', + 'iso-ir-138': 'iso-8859-8', + 'iso8859-8': 'iso-8859-8', + 'iso88598': 'iso-8859-8', + 'iso_8859-8': 'iso-8859-8', + 'iso_8859-8:1988': 'iso-8859-8', + 'visual': 'iso-8859-8', + 'csiso88598i': 'iso-8859-8-i', + 'iso-8859-8-i': 'iso-8859-8-i', + 'logical': 'iso-8859-8-i', + 'csisolatin6': 'iso-8859-10', + 'iso-8859-10': 'iso-8859-10', + 'iso-ir-157': 'iso-8859-10', + 'iso8859-10': 'iso-8859-10', + 'iso885910': 'iso-8859-10', + 'l6': 'iso-8859-10', + 'latin6': 'iso-8859-10', + 'iso-8859-13': 'iso-8859-13', + 'iso8859-13': 'iso-8859-13', + 'iso885913': 'iso-8859-13', + 'iso-8859-14': 'iso-8859-14', + 'iso8859-14': 'iso-8859-14', + 'iso885914': 'iso-8859-14', + 'csisolatin9': 'iso-8859-15', + 'iso-8859-15': 'iso-8859-15', + 'iso8859-15': 'iso-8859-15', + 'iso885915': 'iso-8859-15', + 'iso_8859-15': 'iso-8859-15', + 'l9': 'iso-8859-15', + 'iso-8859-16': 'iso-8859-16', + 'cskoi8r': 'koi8-r', + 'koi': 'koi8-r', + 'koi8': 'koi8-r', + 'koi8-r': 'koi8-r', + 'koi8_r': 'koi8-r', + 'koi8-u': 'koi8-u', + 'csmacintosh': 'macintosh', + 'mac': 'macintosh', + 'macintosh': 'macintosh', + 'x-mac-roman': 'macintosh', + 'dos-874': 'windows-874', + 'iso-8859-11': 'windows-874', + 'iso8859-11': 'windows-874', + 'iso885911': 'windows-874', + 'tis-620': 'windows-874', + 'windows-874': 'windows-874', + 'cp1250': 'windows-1250', + 'windows-1250': 'windows-1250', + 'x-cp1250': 'windows-1250', + 'cp1251': 'windows-1251', + 'windows-1251': 'windows-1251', + 'x-cp1251': 'windows-1251', + 'ansi_x3.4-1968': 'windows-1252', + 'ascii': 'windows-1252', + 'cp1252': 'windows-1252', + 'cp819': 'windows-1252', + 'csisolatin1': 'windows-1252', + 'ibm819': 'windows-1252', + 'iso-8859-1': 'windows-1252', + 'iso-ir-100': 'windows-1252', + 'iso8859-1': 'windows-1252', + 'iso88591': 'windows-1252', + 
'iso_8859-1': 'windows-1252', + 'iso_8859-1:1987': 'windows-1252', + 'l1': 'windows-1252', + 'latin1': 'windows-1252', + 'us-ascii': 'windows-1252', + 'windows-1252': 'windows-1252', + 'x-cp1252': 'windows-1252', + 'cp1253': 'windows-1253', + 'windows-1253': 'windows-1253', + 'x-cp1253': 'windows-1253', + 'cp1254': 'windows-1254', + 'csisolatin5': 'windows-1254', + 'iso-8859-9': 'windows-1254', + 'iso-ir-148': 'windows-1254', + 'iso8859-9': 'windows-1254', + 'iso88599': 'windows-1254', + 'iso_8859-9': 'windows-1254', + 'iso_8859-9:1989': 'windows-1254', + 'l5': 'windows-1254', + 'latin5': 'windows-1254', + 'windows-1254': 'windows-1254', + 'x-cp1254': 'windows-1254', + 'cp1255': 'windows-1255', + 'windows-1255': 'windows-1255', + 'x-cp1255': 'windows-1255', + 'cp1256': 'windows-1256', + 'windows-1256': 'windows-1256', + 'x-cp1256': 'windows-1256', + 'cp1257': 'windows-1257', + 'windows-1257': 'windows-1257', + 'x-cp1257': 'windows-1257', + 'cp1258': 'windows-1258', + 'windows-1258': 'windows-1258', + 'x-cp1258': 'windows-1258', + 'x-mac-cyrillic': 'x-mac-cyrillic', + 'x-mac-ukrainian': 'x-mac-cyrillic', + 'chinese': 'gbk', + 'csgb2312': 'gbk', + 'csiso58gb231280': 'gbk', + 'gb2312': 'gbk', + 'gb_2312': 'gbk', + 'gb_2312-80': 'gbk', + 'gbk': 'gbk', + 'iso-ir-58': 'gbk', + 'x-gbk': 'gbk', + 'gb18030': 'gb18030', + 'hz-gb-2312': 'hz-gb-2312', + 'big5': 'big5', + 'big5-hkscs': 'big5', + 'cn-big5': 'big5', + 'csbig5': 'big5', + 'x-x-big5': 'big5', + 'cseucpkdfmtjapanese': 'euc-jp', + 'euc-jp': 'euc-jp', + 'x-euc-jp': 'euc-jp', + 'csiso2022jp': 'iso-2022-jp', + 'iso-2022-jp': 'iso-2022-jp', + 'csshiftjis': 'shift_jis', + 'ms_kanji': 'shift_jis', + 'shift-jis': 'shift_jis', + 'shift_jis': 'shift_jis', + 'sjis': 'shift_jis', + 'windows-31j': 'shift_jis', + 'x-sjis': 'shift_jis', + 'cseuckr': 'euc-kr', + 'csksc56011987': 'euc-kr', + 'euc-kr': 'euc-kr', + 'iso-ir-149': 'euc-kr', + 'korean': 'euc-kr', + 'ks_c_5601-1987': 'euc-kr', + 'ks_c_5601-1989': 'euc-kr', + 'ksc5601': 
'euc-kr', + 'ksc_5601': 'euc-kr', + 'windows-949': 'euc-kr', + 'csiso2022kr': 'iso-2022-kr', + 'iso-2022-kr': 'iso-2022-kr', + 'utf-16be': 'utf-16be', + 'utf-16': 'utf-16le', + 'utf-16le': 'utf-16le', + 'x-user-defined': 'x-user-defined', +} diff --git a/html5lib/contrib/webencodings/mklabels.py b/html5lib/contrib/webencodings/mklabels.py new file mode 100644 index 00000000..22168eb2 --- /dev/null +++ b/html5lib/contrib/webencodings/mklabels.py @@ -0,0 +1,56 @@ +""" + + webencodings.mklabels + ~~~~~~~~~~~~~~~~~~~~~ + + Regenarate the webencodings.labels module. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +import json +from urllib.request import urlopen + + +def assert_lower(string): + assert string == string.lower() + return string + + +def generate(url): + parts = ['''\ +""" + + webencodings.labels + ~~~~~~~~~~~~~~~~~~~ + + Map encoding labels to their name. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +# XXX Do not edit! +# This file is automatically generated by mklabels.py + +LABELS = { +'''] + labels = [ + (repr(assert_lower(label)).lstrip('u'), + repr(encoding['name']).lstrip('u')) + for category in json.loads(urlopen(url).read().decode('ascii')) + for encoding in category['encodings'] + for label in encoding['labels']] + max_len = max(len(label) for label, name in labels) + parts.extend( + ' %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name) + for label, name in labels) + parts.append('}') + return ''.join(parts) + + +if __name__ == '__main__': + print(generate('http://encoding.spec.whatwg.org/encodings.json')) diff --git a/html5lib/contrib/webencodings/tests.py b/html5lib/contrib/webencodings/tests.py new file mode 100644 index 00000000..4852d1f5 --- /dev/null +++ b/html5lib/contrib/webencodings/tests.py @@ -0,0 +1,151 @@ +# coding: utf-8 +""" + + webencodings.tests + ~~~~~~~~~~~~~~~~~~ + + A basic test suite for Encoding. 
+ + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode, + IncrementalDecoder, IncrementalEncoder, UTF8) + + +def assert_raises(exception, function, *args, **kwargs): + try: + function(*args, **kwargs) + except exception: + return + else: # pragma: no cover + raise AssertionError('Did not raise %s.' % exception) + + +def test_labels(): + assert lookup('utf-8').name == 'utf-8' + assert lookup('Utf-8').name == 'utf-8' + assert lookup('UTF-8').name == 'utf-8' + assert lookup('utf8').name == 'utf-8' + assert lookup('utf8').name == 'utf-8' + assert lookup('utf8 ').name == 'utf-8' + assert lookup(' \r\nutf8\t').name == 'utf-8' + assert lookup('u8') is None # Python label. + assert lookup('utf-8 ') is None # Non-ASCII white space. + + assert lookup('US-ASCII').name == 'windows-1252' + assert lookup('iso-8859-1').name == 'windows-1252' + assert lookup('latin1').name == 'windows-1252' + assert lookup('LATIN1').name == 'windows-1252' + assert lookup('latin-1') is None + assert lookup('LATİN1') is None # ASCII-only case insensitivity. 
+ + +def test_all_labels(): + for label in LABELS: + assert decode(b'', label) == ('', lookup(label)) + assert encode('', label) == b'' + for repeat in [0, 1, 12]: + output, _ = iter_decode([b''] * repeat, label) + assert list(output) == [] + assert list(iter_encode([''] * repeat, label)) == [] + decoder = IncrementalDecoder(label) + assert decoder.decode(b'') == '' + assert decoder.decode(b'', final=True) == '' + encoder = IncrementalEncoder(label) + assert encoder.encode('') == b'' + assert encoder.encode('', final=True) == b'' + # All encoding names are valid labels too: + for name in set(LABELS.values()): + assert lookup(name).name == name + + +def test_invalid_label(): + assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid') + assert_raises(LookupError, encode, 'é', 'invalid') + assert_raises(LookupError, iter_decode, [], 'invalid') + assert_raises(LookupError, iter_encode, [], 'invalid') + assert_raises(LookupError, IncrementalDecoder, 'invalid') + assert_raises(LookupError, IncrementalEncoder, 'invalid') + + +def test_decode(): + assert decode(b'\x80', 'latin1') == ('€', lookup('latin1')) + assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1')) + assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8')) + assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8')) + assert decode(b'\xc3\xa9', 'ascii') == ('\u00c3\u00a9', lookup('ascii')) # No BOM: bytes read as windows-1252 + assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8')) # UTF-8 with BOM + + assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be')) # UTF-16-BE with BOM + assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le')) # UTF-16-LE with BOM + assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be')) + assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le')) + + assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be')) + assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le')) + assert 
decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le')) + + assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be')) + assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le')) + assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le')) + + +def test_encode(): + assert encode('é', 'latin1') == b'\xe9' + assert encode('é', 'utf8') == b'\xc3\xa9' + assert encode('é', 'utf8') == b'\xc3\xa9' + assert encode('é', 'utf-16') == b'\xe9\x00' + assert encode('é', 'utf-16le') == b'\xe9\x00' + assert encode('é', 'utf-16be') == b'\x00\xe9' + + +def test_iter_decode(): + def iter_decode_to_string(input, fallback_encoding): + output, _encoding = iter_decode(input, fallback_encoding) + return ''.join(output) + assert iter_decode_to_string([], 'latin1') == '' + assert iter_decode_to_string([b''], 'latin1') == '' + assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é' + assert iter_decode_to_string([b'hello'], 'latin1') == 'hello' + assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello' + assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello' + assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'é' + assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é' + assert iter_decode_to_string([ + b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é' + assert iter_decode_to_string([ + b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD' + assert iter_decode_to_string([ + b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é' + assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == '' + assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»' + assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é' + assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é' + assert iter_decode_to_string([ + b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é' + assert iter_decode_to_string([ + b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo' 
+ + +def test_iter_encode(): + assert b''.join(iter_encode([], 'latin1')) == b'' + assert b''.join(iter_encode([''], 'latin1')) == b'' + assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9' + assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9' + assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00' + assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00' + assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9' + assert b''.join(iter_encode([ + '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo' + + +def test_x_user_defined(): + encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca' + decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca' + encoded = b'aa' + decoded = 'aa' + assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined')) + assert encode(decoded, 'x-user-defined') == encoded diff --git a/html5lib/contrib/webencodings/x_user_defined.py b/html5lib/contrib/webencodings/x_user_defined.py new file mode 100644 index 00000000..8467f4f0 --- /dev/null +++ b/html5lib/contrib/webencodings/x_user_defined.py @@ -0,0 +1,323 @@ +# coding: utf-8 +""" + + webencodings.x_user_defined + ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + An implementation of the x-user-defined encoding. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. 
+ +""" + +import codecs + + +### Codec APIs + +class Codec(codecs.Codec): + + def encode(self, input, errors='strict'): + return codecs.charmap_encode(input, errors, encoding_table) + + def decode(self, input, errors='strict'): + return codecs.charmap_decode(input, errors, decoding_table) + + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return codecs.charmap_encode(input, self.errors, encoding_table)[0] + + +class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + return codecs.charmap_decode(input, self.errors, decoding_table)[0] + + +class StreamWriter(Codec, codecs.StreamWriter): + pass + + +class StreamReader(Codec, codecs.StreamReader): + pass + + +### encodings module API + +codec_info = codecs.CodecInfo( + name='x-user-defined', + encode=Codec().encode, + decode=Codec().decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, +) + + +### Decoding Table + +# Python 3: +# for c in range(256): print(' %r' % chr(c if c < 128 else c + 0xF700)) +decoding_table = ( + '\x00' + '\x01' + '\x02' + '\x03' + '\x04' + '\x05' + '\x06' + '\x07' + '\x08' + '\t' + '\n' + '\x0b' + '\x0c' + '\r' + '\x0e' + '\x0f' + '\x10' + '\x11' + '\x12' + '\x13' + '\x14' + '\x15' + '\x16' + '\x17' + '\x18' + '\x19' + '\x1a' + '\x1b' + '\x1c' + '\x1d' + '\x1e' + '\x1f' + ' ' + '!' + '"' + '#' + '$' + '%' + '&' + "'" + '(' + ')' + '*' + '+' + ',' + '-' + '.' + '/' + '0' + '1' + '2' + '3' + '4' + '5' + '6' + '7' + '8' + '9' + ':' + ';' + '<' + '=' + '>' + '?' 
+ '@' + 'A' + 'B' + 'C' + 'D' + 'E' + 'F' + 'G' + 'H' + 'I' + 'J' + 'K' + 'L' + 'M' + 'N' + 'O' + 'P' + 'Q' + 'R' + 'S' + 'T' + 'U' + 'V' + 'W' + 'X' + 'Y' + 'Z' + '[' + '\\' + ']' + '^' + '_' + '`' + 'a' + 'b' + 'c' + 'd' + 'e' + 'f' + 'g' + 'h' + 'i' + 'j' + 'k' + 'l' + 'm' + 'n' + 'o' + 'p' + 'q' + 'r' + 's' + 't' + 'u' + 'v' + 'w' + 'x' + 'y' + 'z' + '{' + '|' + '}' + '~' + '\x7f' + '\uf780' + '\uf781' + '\uf782' + '\uf783' + '\uf784' + '\uf785' + '\uf786' + '\uf787' + '\uf788' + '\uf789' + '\uf78a' + '\uf78b' + '\uf78c' + '\uf78d' + '\uf78e' + '\uf78f' + '\uf790' + '\uf791' + '\uf792' + '\uf793' + '\uf794' + '\uf795' + '\uf796' + '\uf797' + '\uf798' + '\uf799' + '\uf79a' + '\uf79b' + '\uf79c' + '\uf79d' + '\uf79e' + '\uf79f' + '\uf7a0' + '\uf7a1' + '\uf7a2' + '\uf7a3' + '\uf7a4' + '\uf7a5' + '\uf7a6' + '\uf7a7' + '\uf7a8' + '\uf7a9' + '\uf7aa' + '\uf7ab' + '\uf7ac' + '\uf7ad' + '\uf7ae' + '\uf7af' + '\uf7b0' + '\uf7b1' + '\uf7b2' + '\uf7b3' + '\uf7b4' + '\uf7b5' + '\uf7b6' + '\uf7b7' + '\uf7b8' + '\uf7b9' + '\uf7ba' + '\uf7bb' + '\uf7bc' + '\uf7bd' + '\uf7be' + '\uf7bf' + '\uf7c0' + '\uf7c1' + '\uf7c2' + '\uf7c3' + '\uf7c4' + '\uf7c5' + '\uf7c6' + '\uf7c7' + '\uf7c8' + '\uf7c9' + '\uf7ca' + '\uf7cb' + '\uf7cc' + '\uf7cd' + '\uf7ce' + '\uf7cf' + '\uf7d0' + '\uf7d1' + '\uf7d2' + '\uf7d3' + '\uf7d4' + '\uf7d5' + '\uf7d6' + '\uf7d7' + '\uf7d8' + '\uf7d9' + '\uf7da' + '\uf7db' + '\uf7dc' + '\uf7dd' + '\uf7de' + '\uf7df' + '\uf7e0' + '\uf7e1' + '\uf7e2' + '\uf7e3' + '\uf7e4' + '\uf7e5' + '\uf7e6' + '\uf7e7' + '\uf7e8' + '\uf7e9' + '\uf7ea' + '\uf7eb' + '\uf7ec' + '\uf7ed' + '\uf7ee' + '\uf7ef' + '\uf7f0' + '\uf7f1' + '\uf7f2' + '\uf7f3' + '\uf7f4' + '\uf7f5' + '\uf7f6' + '\uf7f7' + '\uf7f8' + '\uf7f9' + '\uf7fa' + '\uf7fb' + '\uf7fc' + '\uf7fd' + '\uf7fe' + '\uf7ff' +) + +### Encoding table +encoding_table = codecs.charmap_build(decoding_table) diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py index 
5ba926e3..c0be95b2 100644 --- a/html5lib/filters/alphabeticalattributes.py +++ b/html5lib/filters/alphabeticalattributes.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from . import base diff --git a/html5lib/filters/base.py b/html5lib/filters/base.py index c7dbaed0..6d6639e6 100644 --- a/html5lib/filters/base.py +++ b/html5lib/filters/base.py @@ -1,7 +1,6 @@ -from __future__ import absolute_import, division, unicode_literals -class Filter(object): +class Filter: def __init__(self, source): self.source = source diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py index aefb5c84..c8dc57b8 100644 --- a/html5lib/filters/inject_meta_charset.py +++ b/html5lib/filters/inject_meta_charset.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from . import base diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index acd4d7a2..0d47f921 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -1,6 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals - -from six import text_type from . 
import base from ..constants import namespaces, voidElements @@ -33,9 +30,9 @@ def __iter__(self): if type in ("StartTag", "EmptyTag"): namespace = token["namespace"] name = token["name"] - assert namespace is None or isinstance(namespace, text_type) + assert namespace is None or isinstance(namespace, str) assert namespace != "" - assert isinstance(name, text_type) + assert isinstance(name, str) assert name != "" assert isinstance(token["data"], dict) if (not namespace or namespace == namespaces["html"]) and name in voidElements: @@ -45,18 +42,18 @@ def __iter__(self): if type == "StartTag" and self.require_matching_tags: open_elements.append((namespace, name)) for (namespace, name), value in token["data"].items(): - assert namespace is None or isinstance(namespace, text_type) + assert namespace is None or isinstance(namespace, str) assert namespace != "" - assert isinstance(name, text_type) + assert isinstance(name, str) assert name != "" - assert isinstance(value, text_type) + assert isinstance(value, str) elif type == "EndTag": namespace = token["namespace"] name = token["name"] - assert namespace is None or isinstance(namespace, text_type) + assert namespace is None or isinstance(namespace, str) assert namespace != "" - assert isinstance(name, text_type) + assert isinstance(name, str) assert name != "" if (not namespace or namespace == namespaces["html"]) and name in voidElements: assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name} @@ -66,26 +63,26 @@ def __iter__(self): elif type == "Comment": data = token["data"] - assert isinstance(data, text_type) + assert isinstance(data, str) elif type in ("Characters", "SpaceCharacters"): data = token["data"] - assert isinstance(data, text_type) + assert isinstance(data, str) assert data != "" if type == "SpaceCharacters": assert data.strip(spaceCharacters) == "" elif type == "Doctype": name = token["name"] - assert name is None or isinstance(name, text_type) - assert token["publicId"] is None 
or isinstance(name, text_type) - assert token["systemId"] is None or isinstance(name, text_type) + assert name is None or isinstance(name, str) + assert token["publicId"] is None or isinstance(name, str) + assert token["systemId"] is None or isinstance(name, str) elif type == "Entity": - assert isinstance(token["name"], text_type) + assert isinstance(token["name"], str) elif type == "SerializerError": - assert isinstance(token["data"], text_type) + assert isinstance(token["data"], str) else: assert False, "Unknown token type: %(type)s" % {"type": type} diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py index 4a865012..a44b2a00 100644 --- a/html5lib/filters/optionaltags.py +++ b/html5lib/filters/optionaltags.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from . import base diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index ea2c5dd3..94c8602c 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -6,14 +6,12 @@ if Bleach is unsuitable for your needs. """ -from __future__ import absolute_import, division, unicode_literals import re import warnings +from urllib.parse import urlparse from xml.sax.saxutils import escape, unescape -from six.moves import urllib_parse as urlparse - from . 
import base from ..constants import namespaces, prefixes @@ -846,7 +844,7 @@ def allowed_token(self, token): # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") try: - uri = urlparse.urlparse(val_unescaped) + uri = urlparse(val_unescaped) except ValueError: uri = None del attrs[attr] diff --git a/html5lib/filters/whitespace.py b/html5lib/filters/whitespace.py index 0d12584b..ab40ef5a 100644 --- a/html5lib/filters/whitespace.py +++ b/html5lib/filters/whitespace.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import re diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index b3c206d1..91d71a88 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,6 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals -from six import viewkeys - from . import _inputstream from . import _tokenizer @@ -69,7 +66,7 @@ def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElemen return p.parseFragment(doc, container=container, **kwargs) -class HTMLParser(object): +class HTMLParser: """HTML parser Generates a tree structure from a stream of (possibly malformed) HTML. 
@@ -397,7 +394,7 @@ def parseRCDataRawtext(self, token, contentType): self.phase = self.phases["text"] -class Phase(object): +class Phase: """Base class for helper object that implements each phase of processing """ __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") @@ -428,7 +425,7 @@ def processSpaceCharacters(self, token): def processStartTag(self, token): # Note the caching is done here rather than BoundMethodDispatcher as doing it there # requires a circular reference to the Phase, and this ends up with a significant - # (CPython 2.7, 3.8) GC cost when parsing many short inputs + # (CPython 3.8) GC cost when parsing many short inputs name = token["name"] # In Py2, using `in` is quicker in general than try/except KeyError # In Py3, `in` is quicker when there are few cache hits (typically short inputs) @@ -455,7 +452,7 @@ def startTagHtml(self, token): def processEndTag(self, token): # Note the caching is done here rather than BoundMethodDispatcher as doing it there # requires a circular reference to the Phase, and this ends up with a significant - # (CPython 2.7, 3.8) GC cost when parsing many short inputs + # (CPython 3.8) GC cost when parsing many short inputs name = token["name"] # In Py2, using `in` is quicker in general than try/except KeyError # In Py3, `in` is quicker when there are few cache hits (typically short inputs) @@ -2774,7 +2771,7 @@ def processEndTag(self, token): def adjust_attributes(token, replacements): - needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) + needs_adjustment = token['data'].keys() & replacements.keys() if needs_adjustment: token['data'] = type(token['data'])((replacements.get(k, k), v) for k, v in token['data'].items()) diff --git a/html5lib/serializer.py b/html5lib/serializer.py index a171ac1c..ed52593f 100644 --- a/html5lib/serializer.py +++ b/html5lib/serializer.py @@ -1,6 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals -from six import text_type - import re 
from codecs import register_error, xmlcharrefreplace_errors @@ -101,7 +98,7 @@ def serialize(input, tree="etree", encoding=None, **serializer_opts): return s.render(walker(input), encoding) -class HTMLSerializer(object): +class HTMLSerializer: # attribute quoting options quote_attr_values = "legacy" # be secure by default @@ -222,14 +219,14 @@ def __init__(self, **kwargs): self.strict = False def encode(self, string): - assert isinstance(string, text_type) + assert isinstance(string, str) if self.encoding: return string.encode(self.encoding, "htmlentityreplace") else: return string def encodeStrict(self, string): - assert isinstance(string, text_type) + assert isinstance(string, str) if self.encoding: return string.encode(self.encoding, "strict") else: diff --git a/html5lib/tests/__init__.py b/html5lib/tests/__init__.py index b8ce2de3..e69de29b 100644 --- a/html5lib/tests/__init__.py +++ b/html5lib/tests/__init__.py @@ -1 +0,0 @@ -from __future__ import absolute_import, division, unicode_literals diff --git a/html5lib/tests/conftest.py b/html5lib/tests/conftest.py index fffeb50c..de9b1572 100644 --- a/html5lib/tests/conftest.py +++ b/html5lib/tests/conftest.py @@ -1,4 +1,3 @@ -from __future__ import print_function import os.path import sys @@ -54,7 +53,7 @@ def pytest_configure(config): # Check for optional requirements req_file = os.path.join(_root, "requirements-optional.txt") if os.path.exists(req_file): - with open(req_file, "r") as fp: + with open(req_file) as fp: for line in fp: if (line.strip() and not (line.startswith("-r") or @@ -79,7 +78,7 @@ def pytest_configure(config): import xml.etree.ElementTree as ElementTree try: - import xml.etree.cElementTree as cElementTree + import xml.etree.ElementTree as cElementTree except ImportError: msgs.append("cElementTree unable to be imported") else: diff --git a/html5lib/tests/sanitizer.py b/html5lib/tests/sanitizer.py index 16e53868..93ad4f52 100644 --- a/html5lib/tests/sanitizer.py +++ b/html5lib/tests/sanitizer.py 
@@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import codecs import json diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py index 1bd0ccc1..3a6f37c2 100644 --- a/html5lib/tests/support.py +++ b/html5lib/tests/support.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals # pylint:disable=wrong-import-position @@ -86,7 +85,7 @@ def __getitem__(self, key): return dict.get(self, key, self.default) -class TestData(object): +class TestData: def __init__(self, filename, newTestHeading="data", encoding="utf8"): if encoding is None: self.f = open(filename, mode="rb") diff --git a/html5lib/tests/test_alphabeticalattributes.py b/html5lib/tests/test_alphabeticalattributes.py index 7d5b8e0f..87beb8f1 100644 --- a/html5lib/tests/test_alphabeticalattributes.py +++ b/html5lib/tests/test_alphabeticalattributes.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from collections import OrderedDict diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 47c4814a..10b666da 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import os @@ -9,7 +8,7 @@ def test_basic_prescan_length(): - data = "Caf\u00E9".encode('utf-8') + data = "Caf\u00E9".encode() pad = 1024 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 1024 # Sanity @@ -18,7 +17,7 @@ def test_basic_prescan_length(): def test_parser_reparse(): - data = "Caf\u00E9".encode('utf-8') + data = "Caf\u00E9".encode() pad = 10240 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 10240 # Sanity diff --git a/html5lib/tests/test_meta.py b/html5lib/tests/test_meta.py index e02268aa..2fc6140d 100644 --- a/html5lib/tests/test_meta.py +++ b/html5lib/tests/test_meta.py @@ -1,10 +1,4 @@ -from __future__ 
import absolute_import, division, unicode_literals - -import six -try: - from unittest.mock import Mock -except ImportError: - from mock import Mock +from unittest.mock import Mock from . import support @@ -30,11 +24,7 @@ def test_errorMessage(): r = support.errorMessage(input, expected, actual) # Assertions! - if six.PY2: - assert b"Input:\n1\nExpected:\n2\nReceived\n3\n" == r - else: - assert six.PY3 - assert "Input:\n1\nExpected:\n2\nReceived\n3\n" == r + assert "Input:\n1\nExpected:\n2\nReceived\n3\n" == r assert input.__repr__.call_count == 1 assert expected.__repr__.call_count == 1 diff --git a/html5lib/tests/test_optionaltags_filter.py b/html5lib/tests/test_optionaltags_filter.py index cd282149..180a109e 100644 --- a/html5lib/tests/test_optionaltags_filter.py +++ b/html5lib/tests/test_optionaltags_filter.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from html5lib.filters.optionaltags import Filter diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 6b464bea..da76cd41 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -1,7 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals - -from six import PY2, text_type - import io from . 
import support # noqa @@ -74,11 +70,6 @@ def test_debug_log(): ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}), ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})] - if PY2: - for i, log in enumerate(expected): - log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log] - expected[i] = tuple(log) - assert parser.log == expected diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index 499310b6..562ee7fa 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import warnings diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index a2be0be5..5c225790 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import os import json diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index efe9b472..0512419c 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from . 
import support # noqa @@ -8,8 +7,8 @@ import pytest -import six -from six.moves import http_client, urllib +import http.client +import urllib.response from html5lib._inputstream import (BufferedStream, HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream) @@ -105,7 +104,7 @@ def test_char_ascii(): def test_char_utf8(): - stream = HTMLInputStream('\u2018'.encode('utf-8'), override_encoding='utf-8') + stream = HTMLInputStream('\u2018'.encode(), override_encoding='utf-8') assert stream.charEncoding[0].name == 'utf-8' assert stream.char() == '\u2018' @@ -186,12 +185,12 @@ def test_python_issue_20007(): Make sure we have a work-around for Python bug #20007 http://bugs.python.org/issue20007 """ - class FakeSocket(object): + class FakeSocket: def makefile(self, _mode, _bufsize=None): # pylint:disable=unused-argument return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") - source = http_client.HTTPResponse(FakeSocket()) + source = http.client.HTTPResponse(FakeSocket()) source.begin() stream = HTMLInputStream(source) assert stream.charsUntil(" ") == "Text" @@ -202,15 +201,12 @@ def test_python_issue_20007_b(): Make sure we have a work-around for Python bug #20007 http://bugs.python.org/issue20007 """ - if six.PY2: - return - - class FakeSocket(object): + class FakeSocket: def makefile(self, _mode, _bufsize=None): # pylint:disable=unused-argument return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") - source = http_client.HTTPResponse(FakeSocket()) + source = http.client.HTTPResponse(FakeSocket()) source.begin() wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com") stream = HTMLInputStream(wrapped) diff --git a/html5lib/tests/test_tokenizer2.py b/html5lib/tests/test_tokenizer2.py index 158d847a..4e993571 100644 --- a/html5lib/tests/test_tokenizer2.py +++ b/html5lib/tests/test_tokenizer2.py @@ -1,9 +1,6 @@ -from __future__ import absolute_import, division, unicode_literals import io -from six import unichr, text_type - from html5lib._tokenizer import 
HTMLTokenizer from html5lib.constants import tokenTypes @@ -16,7 +13,7 @@ def ignore_parse_errors(toks): def test_maintain_attribute_order(): # generate loads to maximize the chance a hash-based mutation will occur - attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))] + attrs = [(chr(x), str(i)) for i, x in enumerate(range(ord('a'), ord('z')))] stream = io.StringIO("") toks = HTMLTokenizer(stream) @@ -49,7 +46,7 @@ def test_duplicate_attribute(): def test_maintain_duplicate_attribute_order(): # generate loads to maximize the chance a hash-based mutation will occur - attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))] + attrs = [(chr(x), str(i)) for i, x in enumerate(range(ord('a'), ord('z')))] stream = io.StringIO("") toks = HTMLTokenizer(stream) diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py index 95e56c00..3af383c3 100644 --- a/html5lib/tests/test_treeadapters.py +++ b/html5lib/tests/test_treeadapters.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from . 
import support # noqa diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 780ca964..22ee0cb7 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -1,9 +1,7 @@ -from __future__ import absolute_import, division, unicode_literals import itertools import sys -from six import unichr, text_type import pytest try: @@ -74,11 +72,11 @@ def param_treewalker_six_mix(): # fragment but not using the u'' syntax nor importing unicode_literals sm_tests = [ ('Example', - [(str('class'), str('test123'))], + [('class', 'test123')], '\n class="test123"\n href="http://example.com"\n "Example"'), ('', - [(str('rel'), str('alternate'))], + [('rel', 'alternate')], '\n href="http://example.com/cow"\n rel="alternate"\n "Example"') ] @@ -151,7 +149,7 @@ def test_maintain_attribute_order(treeName): pytest.skip("Treebuilder not loaded") # generate loads to maximize the chance a hash-based mutation will occur - attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))] + attrs = [(chr(x), str(i)) for i, x in enumerate(range(ord('a'), ord('z')))] data = "" parser = html5parser.HTMLParser(tree=treeAPIs["builder"]) diff --git a/html5lib/tests/test_whitespace_filter.py b/html5lib/tests/test_whitespace_filter.py index e9da6140..d4e4e3be 100644 --- a/html5lib/tests/test_whitespace_filter.py +++ b/html5lib/tests/test_whitespace_filter.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from html5lib.filters.whitespace import Filter from html5lib.constants import spaceCharacters diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py index b49d2e6e..d2605a12 100644 --- a/html5lib/tests/tokenizer.py +++ b/html5lib/tests/tokenizer.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import codecs import json @@ -6,13 +5,12 @@ import re import pytest -from six import unichr from html5lib._tokenizer import HTMLTokenizer 
from html5lib import constants, _utils -class TokenizerTestParser(object): +class TokenizerTestParser: def __init__(self, initialState, lastStartTag=None): self.tokenizer = HTMLTokenizer self._state = initialState @@ -146,15 +144,15 @@ def repl(m): low = int(m.group(2), 16) if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF: cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000 - return unichr(cp) + return chr(cp) else: - return unichr(high) + unichr(low) + return chr(high) + chr(low) else: - return unichr(int(m.group(1), 16)) + return chr(int(m.group(1), 16)) try: return _surrogateRe.sub(repl, inp) except ValueError: - # This occurs when unichr throws ValueError, which should + # This occurs when chr throws ValueError, which should # only be for a lone-surrogate. if _utils.supports_lone_surrogates: raise diff --git a/html5lib/tests/tokenizertotree.py b/html5lib/tests/tokenizertotree.py index 42463f32..6c0b4f77 100644 --- a/html5lib/tests/tokenizertotree.py +++ b/html5lib/tests/tokenizertotree.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import sys import os @@ -25,7 +24,7 @@ def main(out_path): def run_file(filename, out_path): try: - tests_data = json.load(open(filename, "r")) + tests_data = json.load(open(filename)) except ValueError: sys.stderr.write("Failed to load %s\n" % filename) return diff --git a/html5lib/tests/tree_construction.py b/html5lib/tests/tree_construction.py index 363b48c2..e2381754 100644 --- a/html5lib/tests/tree_construction.py +++ b/html5lib/tests/tree_construction.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals import itertools import re diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py index dfeb0ba5..1444fc9a 100644 --- a/html5lib/treeadapters/__init__.py +++ b/html5lib/treeadapters/__init__.py @@ -16,7 +16,6 @@ genshi_tree = genshi.to_genshi(TreeWalker(tree)) """ -from __future__ import absolute_import, division, 
unicode_literals from . import sax diff --git a/html5lib/treeadapters/genshi.py b/html5lib/treeadapters/genshi.py index 61d5fb6a..b0b29ed3 100644 --- a/html5lib/treeadapters/genshi.py +++ b/html5lib/treeadapters/genshi.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from genshi.core import QName, Attrs from genshi.core import START, END, TEXT, COMMENT, DOCTYPE diff --git a/html5lib/treeadapters/sax.py b/html5lib/treeadapters/sax.py index f4ccea5a..ead1a5c4 100644 --- a/html5lib/treeadapters/sax.py +++ b/html5lib/treeadapters/sax.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from xml.sax.xmlreader import AttributesNSImpl diff --git a/html5lib/treebuilders/__init__.py b/html5lib/treebuilders/__init__.py index d44447ea..90aad5fb 100644 --- a/html5lib/treebuilders/__init__.py +++ b/html5lib/treebuilders/__init__.py @@ -29,7 +29,6 @@ """ -from __future__ import absolute_import, division, unicode_literals from .._utils import default_etree diff --git a/html5lib/treebuilders/base.py b/html5lib/treebuilders/base.py index 020d7e15..3fec12c4 100644 --- a/html5lib/treebuilders/base.py +++ b/html5lib/treebuilders/base.py @@ -1,6 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals -from six import text_type - from ..constants import scopingElements, tableInsertModeElements, namespaces # The scope markers are inserted when entering object elements, @@ -20,7 +17,7 @@ } -class Node(object): +class Node: """Represents an item in the tree""" def __init__(self, name): """Creates a Node @@ -144,7 +141,7 @@ def nodesEqual(self, node1, node2): return True -class TreeBuilder(object): +class TreeBuilder: """Base treebuilder implementation * documentClass - the class to use for the bottommost node of a document @@ -200,7 +197,7 @@ def elementInScope(self, target, variant=None): # match any node with that name exactNode = hasattr(target, "nameTuple") if not exactNode: - if 
isinstance(target, text_type): + if isinstance(target, str): target = (namespaces["html"], target) assert isinstance(target, tuple) @@ -323,7 +320,7 @@ def _setInsertFromTable(self, value): def insertElementNormal(self, token): name = token["name"] - assert isinstance(name, text_type), "Element %s not unicode" % name + assert isinstance(name, str), "Element %s not unicode" % name namespace = token.get("namespace", self.defaultNamespace) element = self.elementClass(name, namespace) element.attributes = token["data"] diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index d8b53004..bc56c708 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -1,10 +1,6 @@ -from __future__ import absolute_import, division, unicode_literals -try: - from collections.abc import MutableMapping -except ImportError: # Python 2.7 - from collections import MutableMapping +from collections.abc import MutableMapping from xml.dom import minidom, Node import weakref diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 0b745081..f9564fe0 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -1,8 +1,5 @@ -from __future__ import absolute_import, division, unicode_literals # pylint:disable=protected-access -from six import text_type - import re from copy import copy @@ -222,7 +219,7 @@ def serializeElement(element, indent=0): elif element.tag == ElementTreeCommentType: rv.append("|%s" % (' ' * indent, element.text)) else: - assert isinstance(element.tag, text_type), \ + assert isinstance(element.tag, str), \ "Expected unicode, got %s, %s" % (type(element.tag), element.tag) nsmatch = tag_regexp.match(element.tag) diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index e73de61a..b0be4617 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -9,17 +9,13 @@ When any of these things occur, we emit a DataLossWarning """ 
-from __future__ import absolute_import, division, unicode_literals # pylint:disable=protected-access import warnings import re import sys -try: - from collections.abc import MutableMapping -except ImportError: - from collections import MutableMapping +from collections.abc import MutableMapping from . import base from ..constants import DataLossWarning @@ -28,7 +24,6 @@ from .. import _ihatexml import lxml.etree as etree -from six import PY3, binary_type fullTree = True @@ -37,14 +32,14 @@ comment_type = etree.Comment("asd").tag -class DocumentType(object): +class DocumentType: def __init__(self, name, publicId, systemId): self.name = name self.publicId = publicId self.systemId = systemId -class Document(object): +class Document: def __init__(self): self._elementTree = None self._childNodes = [] @@ -208,8 +203,6 @@ def _coerceKey(self, key): def __getitem__(self, key): value = self._element._element.attrib[self._coerceKey(key)] - if not PY3 and isinstance(value, binary_type): - value = value.decode("ascii") return value def __setitem__(self, key, value): diff --git a/html5lib/treewalkers/__init__.py b/html5lib/treewalkers/__init__.py index b2d3aac3..b78d6f46 100644 --- a/html5lib/treewalkers/__init__.py +++ b/html5lib/treewalkers/__init__.py @@ -8,7 +8,6 @@ returns an iterator which generates tokens. """ -from __future__ import absolute_import, division, unicode_literals from .. 
import constants from .._utils import default_etree diff --git a/html5lib/treewalkers/base.py b/html5lib/treewalkers/base.py index 80c474c4..7ee75d81 100644 --- a/html5lib/treewalkers/base.py +++ b/html5lib/treewalkers/base.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from xml.dom import Node from ..constants import namespaces, voidElements, spaceCharacters @@ -17,7 +16,7 @@ spaceCharacters = "".join(spaceCharacters) -class TreeWalker(object): +class TreeWalker: """Walks a tree yielding tokens Tokens are dicts that all have a ``type`` field specifying the type of the diff --git a/html5lib/treewalkers/dom.py b/html5lib/treewalkers/dom.py index b0c89b00..85e12505 100644 --- a/html5lib/treewalkers/dom.py +++ b/html5lib/treewalkers/dom.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from xml.dom import Node diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index 411a1d45..41607f52 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -1,10 +1,7 @@ -from __future__ import absolute_import, division, unicode_literals from collections import OrderedDict import re -from six import string_types - from . 
import base from .._utils import moduleFactoryFactory @@ -51,7 +48,7 @@ def getNodeDetails(self, node): return base.COMMENT, node.text else: - assert isinstance(node.tag, string_types), type(node.tag) + assert isinstance(node.tag, str), type(node.tag) # This is assumed to be an ordinary element match = tag_regexp.match(node.tag) if match: diff --git a/html5lib/treewalkers/etree_lxml.py b/html5lib/treewalkers/etree_lxml.py index a614ac5b..0ec633ac 100644 --- a/html5lib/treewalkers/etree_lxml.py +++ b/html5lib/treewalkers/etree_lxml.py @@ -1,6 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals -from six import text_type - from collections import OrderedDict from lxml import etree @@ -14,13 +11,13 @@ def ensure_str(s): if s is None: return None - elif isinstance(s, text_type): + elif isinstance(s, str): return s else: return s.decode("ascii", "strict") -class Root(object): +class Root: def __init__(self, et): self.elementtree = et self.children = [] @@ -58,7 +55,7 @@ def __len__(self): return 1 -class Doctype(object): +class Doctype: def __init__(self, root_node, name, public_id, system_id): self.root_node = root_node self.name = name @@ -81,7 +78,7 @@ def getnext(self): return None -class FragmentWrapper(object): +class FragmentWrapper: def __init__(self, fragment_root, obj): self.root_node = fragment_root self.obj = obj diff --git a/html5lib/treewalkers/genshi.py b/html5lib/treewalkers/genshi.py index 7483be27..78f22fd3 100644 --- a/html5lib/treewalkers/genshi.py +++ b/html5lib/treewalkers/genshi.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import, division, unicode_literals from genshi.core import QName from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT diff --git a/parse.py b/parse.py index e6806b46..14bbe99a 100755 --- a/parse.py +++ b/parse.py @@ -42,7 +42,7 @@ def parse(): try: # Try opening from file system f = open(f, "rb") - except IOError as e: + except OSError as e: sys.stderr.write("Unable to open file: %s\n" 
% e) sys.exit(1) except IndexError: diff --git a/requirements-oldest.txt b/requirements-oldest.txt index 68d0f13d..62d203ae 100644 --- a/requirements-oldest.txt +++ b/requirements-oldest.txt @@ -1,8 +1,7 @@ # This allows us to install the actually oldest supported dependencies and test whether that works. # requirements.txt -six==1.9 -webencodings==0.5.1 +# (nothing) # requirements-optional.txt genshi==0.7.1 ; python_version < '3.8' @@ -26,4 +25,4 @@ pytest==5.4.2 ; python_version >= '3' coverage==5.1 pytest-expect==1.1.0 mock==3.0.5 ; python_version < '3.6' -mock==4.0.2 ; python_version >= '3.6' \ No newline at end of file +mock==4.0.2 ; python_version >= '3.6' diff --git a/requirements-test.txt b/requirements-test.txt index aca31f5e..1415d163 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,5 +6,6 @@ pytest>=4.6.10,<5 ; python_version < '3' pytest>=5.4.2,<8 ; python_version >= '3' coverage>=5.1,<6 pytest-expect>=1.1.0,<2 +six>=1.9 # required by pytest-expect mock>=3.0.5,<4 ; python_version < '3.3' setuptools; python_version >= '3.12' diff --git a/requirements.txt b/requirements.txt index ae7ec3d0..e69de29b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +0,0 @@ -six>=1.9 -webencodings diff --git a/setup.py b/setup.py index 30ee0575..638997c8 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,3 @@ -from __future__ import print_function import ast import codecs @@ -64,11 +63,7 @@ def default_environment(): 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', @@ -106,11 +101,8 @@ def default_environment(): maintainer='James Graham', 
maintainer_email='james@hoppipolla.co.uk', packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), - install_requires=[ - 'six>=1.9', - 'webencodings>=0.5.1', - ], - python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*", + install_requires=[], + python_requires=">=3.8", extras_require={ # A conditional extra will only install these items when the extra is # requested and the condition matches. diff --git a/tox.ini b/tox.ini index fb228e96..94a78542 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py{27,35,36,37,38,39,310,311,py,py3}-{base,optional,oldest} +envlist = py{38,39,310,311,py,py3}-{base,optional,oldest} [testenv] deps = diff --git a/toxver.py b/toxver.py index 68eb71ec..950dc083 100755 --- a/toxver.py +++ b/toxver.py @@ -12,18 +12,11 @@ $ toxver.py pypy-3.8 base TOXENV=pypy3-base - $ toxver.py 2.7 oldest - TOXENV=py27-oldest - $ toxver.py ~3.12.0-0 optional TOXENV=py312-optional """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals import sys @@ -35,10 +28,6 @@ def main(argv): deps = argv[2] - if argv[1].startswith("pypy-2"): - print("TOXENV=pypy-" + deps) - return 0 - if argv[1].startswith("pypy-3"): print("TOXENV=pypy3-" + deps) return 0