I'm not a Python specialist, so bear with me. I'm trying to replace a Perl HTML::TokeParser-based parser, which I use for translating templates into foreign languages, with one built on Python's html.parser. Here's the prototype code, which nearly gives me what I want.
import deepl
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Prototype parser that echoes tags and hands text off for translation.

    Start and end tags are printed in a simplified, rebuilt form. Text nodes
    are passed to ``self.translate_data``, which is not defined in this
    excerpt and must be provided elsewhere in the class.
    """

    def handle_starttag(self, tag, attrs):
        """Print the rebuilt start tag, then each of its attributes.

        Args:
            tag: Name of the tag that was opened.
            attrs: Iterable of the tag's attributes; each item is printed
                on its own line.
        """
        # NOTE: the rebuilt tag omits the attributes. HTMLParser's
        # get_starttag_text() returns the most recently opened start tag
        # exactly as it appeared in the input, if verbatim markup is needed.
        print(f'start <{tag}>')
        for attr in attrs:
            print(" attr:", attr)

    def handle_endtag(self, tag):
        """Print the rebuilt end tag.

        Args:
            tag: Name of the tag that was closed.
        """
        print(f'end </{tag}>')

    def handle_data(self, data):
        """Forward a run of text to the translation hook.

        Args:
            data: Text found between tags.
        """
        # translate_data is expected to be defined on this class outside the
        # visible excerpt.
        self.translate_data(data)
etc. etc. and
# auth_key is expected to be defined elsewhere (not shown in this excerpt).
deepl_client = deepl.DeepLClient(auth_key)

# Translate a formal document from English to French
input_path = "blabla"
output_path = "blabla"

parser = MyHTMLParser()

# Read the whole template up front. The encoding is stated explicitly so the
# result does not depend on the platform default; adjust it if the templates
# are stored in a different encoding.
with open(input_path, 'r', encoding='utf-8') as file:
    content = file.read()

parser.feed(content)
# close() makes the parser process any input it is still buffering, so
# trailing text at the end of the document is not silently dropped.
parser.close()
However, I'd also like access to the raw HTML as it passes through feed(), so that I don't have to reassemble the simpler or untranslated tags myself.