From befaf7f4569578e8d1bdb87f10fb026aadaf9e6d Mon Sep 17 00:00:00 2001 From: James Addison Date: Wed, 30 Dec 2020 23:58:02 +0000 Subject: [PATCH 1/6] Use 'break' statement to early-terminate parser loop; also reduces indentation level of token handling logic --- html5lib/html5parser.py | 65 ++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 74d829d9..a8e987d9 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -205,45 +205,44 @@ def mainLoop(self): prev_token = None new_token = token while new_token is not None: + type = new_token["type"] + if type == ParseErrorToken: + self.parseError(new_token["data"], new_token.get("datavars", {})) + break + prev_token = new_token currentNode = self.tree.openElements[-1] if self.tree.openElements else None currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeName = currentNode.name if currentNode else None - type = new_token["type"] - - if type == ParseErrorToken: - self.parseError(new_token["data"], new_token.get("datavars", {})) - new_token = None + if (len(self.tree.openElements) == 0 or + currentNodeNamespace == self.tree.defaultNamespace or + (self.isMathMLTextIntegrationPoint(currentNode) and + ((type == StartTagToken and + token["name"] not in frozenset(["mglyph", "malignmark"])) or + type in (CharactersToken, SpaceCharactersToken))) or + (currentNodeNamespace == namespaces["mathml"] and + currentNodeName == "annotation-xml" and + type == StartTagToken and + token["name"] == "svg") or + (self.isHTMLIntegrationPoint(currentNode) and + type in (StartTagToken, CharactersToken, SpaceCharactersToken))): + phase = self.phase else: - if (len(self.tree.openElements) == 0 or - currentNodeNamespace == self.tree.defaultNamespace or - (self.isMathMLTextIntegrationPoint(currentNode) and - ((type == StartTagToken and - token["name"] not in frozenset(["mglyph", "malignmark"])) or - type in (CharactersToken, SpaceCharactersToken))) or - (currentNodeNamespace == namespaces["mathml"] and - currentNodeName == "annotation-xml" and - type == StartTagToken and - token["name"] == "svg") or - (self.isHTMLIntegrationPoint(currentNode) and - type in (StartTagToken, CharactersToken, SpaceCharactersToken))): - phase = self.phase - else: - phase = self.phases["inForeignContent"] - - if type == CharactersToken: - new_token = phase.processCharacters(new_token) - elif type == SpaceCharactersToken: - new_token = phase.processSpaceCharacters(new_token) - elif type == StartTagToken: - new_token = phase.processStartTag(new_token) - elif type == EndTagToken: - new_token = phase.processEndTag(new_token) - elif type == CommentToken: - new_token = phase.processComment(new_token) - elif type == DoctypeToken: - new_token = phase.processDoctype(new_token) + phase = self.phases["inForeignContent"] + + if type == CharactersToken: + new_token = phase.processCharacters(new_token) + elif type == SpaceCharactersToken: + new_token = phase.processSpaceCharacters(new_token) + elif type == StartTagToken: + new_token = phase.processStartTag(new_token) + elif type == EndTagToken: + new_token = phase.processEndTag(new_token) + elif type == CommentToken: + new_token = phase.processComment(new_token) + elif type == DoctypeToken: + new_token = phase.processDoctype(new_token) if (type == StartTagToken and prev_token["selfClosing"] and not prev_token["selfClosingAcknowledged"]): From d87ceb93c33769e9e94f9d510ee86540225c4003 Mon Sep 17 00:00:00 2001 From: James Addison Date: Thu, 31 Dec 2020 00:01:19 +0000 Subject: [PATCH 2/6] Unpack gnarly conditional check within parser loop --- html5lib/html5parser.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index a8e987d9..8f8b527a 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -215,21 +215,22 @@ def mainLoop(self): currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeName = currentNode.name if currentNode else None - if (len(self.tree.openElements) == 0 or - currentNodeNamespace == self.tree.defaultNamespace or - (self.isMathMLTextIntegrationPoint(currentNode) and - ((type == StartTagToken and - token["name"] not in frozenset(["mglyph", "malignmark"])) or - type in (CharactersToken, SpaceCharactersToken))) or - (currentNodeNamespace == namespaces["mathml"] and - currentNodeName == "annotation-xml" and - type == StartTagToken and - token["name"] == "svg") or - (self.isHTMLIntegrationPoint(currentNode) and - type in (StartTagToken, CharactersToken, SpaceCharactersToken))): + phase = self.phases["inForeignContent"] + if len(self.tree.openElements) == 0: phase = self.phase - else: - phase = self.phases["inForeignContent"] + elif currentNodeNamespace == self.tree.defaultNamespace: + phase = self.phase + elif self.isMathMLTextIntegrationPoint(currentNode): + if type == StartTagToken and token["name"] not in frozenset(["mglyph", "malignmark"]): + phase = self.phase + elif type in (CharactersToken, SpaceCharactersToken): + phase = self.phase + elif currentNodeNamespace == namespaces["mathml"] and currentNodeName == "annotation-xml": + if type == StartTagToken and token["name"] == "svg": + phase = self.phase + elif self.isHTMLIntegrationPoint(currentNode): + if type in (StartTagToken, CharactersToken, SpaceCharactersToken): + phase = self.phase if type == CharactersToken: new_token = phase.processCharacters(new_token) From e12523d0add57df4044a8c08c26e7abd76a366ad Mon Sep 17 00:00:00 2001 From: James Addison Date: Thu, 31 Dec 2020 00:01:58 +0000 Subject: [PATCH 3/6] Re-order foreign content conditional checks --- html5lib/html5parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 8f8b527a..a87914c3 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -220,6 +220,9 @@ def mainLoop(self): phase = self.phase elif currentNodeNamespace == self.tree.defaultNamespace: phase = self.phase + elif self.isHTMLIntegrationPoint(currentNode): + if type in (StartTagToken, CharactersToken, SpaceCharactersToken): + phase = self.phase elif self.isMathMLTextIntegrationPoint(currentNode): if type == StartTagToken and token["name"] not in frozenset(["mglyph", "malignmark"]): phase = self.phase @@ -228,9 +231,6 @@ def mainLoop(self): elif currentNodeNamespace == namespaces["mathml"] and currentNodeName == "annotation-xml": if type == StartTagToken and token["name"] == "svg": phase = self.phase - elif self.isHTMLIntegrationPoint(currentNode): - if type in (StartTagToken, CharactersToken, SpaceCharactersToken): - phase = self.phase if type == CharactersToken: new_token = phase.processCharacters(new_token) From f7d256fc2fdb7ba9ccf312f2122d321a7eebc64e Mon Sep 17 00:00:00 2001 From: James Addison Date: Thu, 31 Dec 2020 00:02:37 +0000 Subject: [PATCH 4/6] Simplify current node presence test --- html5lib/html5parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index a87914c3..0c0d7a33 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -216,7 +216,7 @@ def mainLoop(self): currentNodeName = currentNode.name if currentNode else None phase = self.phases["inForeignContent"] - if len(self.tree.openElements) == 0: + if currentNode is None: phase = self.phase elif currentNodeNamespace == self.tree.defaultNamespace: phase = self.phase From 369a4123ae453845a383dafb82e7459a4e87839b Mon Sep 17 00:00:00 2001 From: James Addison Date: Thu, 31 Dec 2020 00:03:52 +0000 Subject: [PATCH 5/6] Remove usage of rarely-referenced current node variables --- html5lib/html5parser.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 0c0d7a33..bc7fa134 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -212,13 +212,11 @@ def mainLoop(self): prev_token = new_token currentNode = self.tree.openElements[-1] if self.tree.openElements else None - currentNodeNamespace = currentNode.namespace if currentNode else None - currentNodeName = currentNode.name if currentNode else None phase = self.phases["inForeignContent"] if currentNode is None: phase = self.phase - elif currentNodeNamespace == self.tree.defaultNamespace: + elif currentNode.namespace == self.tree.defaultNamespace: phase = self.phase elif self.isHTMLIntegrationPoint(currentNode): if type in (StartTagToken, CharactersToken, SpaceCharactersToken): @@ -228,7 +226,7 @@ def mainLoop(self): phase = self.phase elif type in (CharactersToken, SpaceCharactersToken): phase = self.phase - elif currentNodeNamespace == namespaces["mathml"] and currentNodeName == "annotation-xml": + elif currentNode.namespace == namespaces["mathml"] and currentNode.name == "annotation-xml": if type == StartTagToken and token["name"] == "svg": phase = self.phase From 1c3047bd470e47d4119406776f9ccef38cdacaea Mon Sep 17 00:00:00 2001 From: James Addison Date: Sat, 9 Jan 2021 12:42:25 +0000 Subject: [PATCH 6/6] Adjustment: use a single boolean to track whether we're in a foreign content phase This is based on the idea that it's likely easier to understand the code -- and that it's hopefully less fragile -- if there is a single boolean with a readable name rather than repeated assignments to a variable that is invoked as a method call later --- html5lib/html5parser.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index bc7fa134..f06a1008 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -213,23 +213,24 @@ def mainLoop(self): prev_token = new_token currentNode = self.tree.openElements[-1] if self.tree.openElements else None - phase = self.phases["inForeignContent"] + in_foreign_content = True if currentNode is None: - phase = self.phase + in_foreign_content = False elif currentNode.namespace == self.tree.defaultNamespace: - phase = self.phase + in_foreign_content = False elif self.isHTMLIntegrationPoint(currentNode): if type in (StartTagToken, CharactersToken, SpaceCharactersToken): - phase = self.phase + in_foreign_content = False elif self.isMathMLTextIntegrationPoint(currentNode): if type == StartTagToken and token["name"] not in frozenset(["mglyph", "malignmark"]): - phase = self.phase + in_foreign_content = False elif type in (CharactersToken, SpaceCharactersToken): - phase = self.phase + in_foreign_content = False elif currentNode.namespace == namespaces["mathml"] and currentNode.name == "annotation-xml": if type == StartTagToken and token["name"] == "svg": - phase = self.phase + in_foreign_content = False + phase = self.phases["inForeignContent"] if in_foreign_content else self.phase if type == CharactersToken: new_token = phase.processCharacters(new_token) elif type == SpaceCharactersToken: