2

I want to validate an HTML string to determine whether it is valid HTML or not.

I tried the following code.

 def must_have_proper_htmltag(self, filename):
     """Return True if the given HTML string contains at least one tag.

     Beautiful Soup is a library for pulling data out of HTML or XML;
     'html.parser' selects the built-in HTML parser, and find() returns
     the first tag found (or None), so bool() reports whether any tag-like
     markup occurs at all.  NOTE: this only detects the *presence* of a
     tag -- it does not validate that the HTML is well-formed.

     :param filename: the HTML markup itself (despite the name, not a path)
     :return: True if any HTML tag was found, False otherwise
     """
     print(filename)
     return bool(BeautifulSoup(filename, 'html.parser').find())
# Demo inputs: one complete document, one with unclosed tags.
# (Adjacent string literals are concatenated; parentheses allow the
# multi-line layout without breaking the literal.)
htmltag = ('<html><head><title>Test</title></head>'
           '<body><h1>Parse me!</h1></body></html>')
nohtmltag = '<html><head><title>Test</title></head>''<body><h1>Parse me!'
print('html checkers:-', qc.must_have_proper_htmltag(htmltag))
print('html checkers:-', qc.must_have_proper_htmltag(nohtmltag))

This function only checks whether an HTML tag is present; it doesn't validate whether the HTML itself is well-formed or not.

How can I validate the HTML? I want the output for the complete HTML string to be True and for the other one to be False.

2 Answers 2

2

You can validate it using the W3C validator:

from py_w3c.validators.html.validator import HTMLValidator

def must_have_proper_htmltag(self, filename):
    """Validate an HTML fragment with the W3C validation service.

    :param filename: the HTML markup to validate (not a file path)
    :return: True when the W3C validator reports no errors, False otherwise
    """
    print(filename)
    html_validator = HTMLValidator()
    html_validator.validate_fragment(filename)
    # validate_fragment() populates html_validator.errors; when there is
    # no error it is an empty list, so "no errors" is exactly the answer.
    return not html_validator.errors

# Adjacent string literals concatenate into one document string.
print('html checkers:-', qc.must_have_proper_htmltag(
    '<!DOCTYPE html><html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1></body></html>'))
Sign up to request clarification or add additional context in comments.

Comments

2

While not an exact match to your requirement, maybe it's easier to leverage work others have already done. For example:

It does not check for single tags but for the whole of the HTML to be correct, which is apparently what you're after.

Possibly this approach may help you as well:

import HTMLParser
import urllib
import sys
import urlparse

##################################################
# config

base_url = 'http://foo.com/bill_reid/'  # crawl root; only URLs under this prefix are followed
depth = 100  # maximum number of breadth-first crawl passes

w3c_validator = 'http://validator.w3.org/'  # validation service endpoint

##################################################
# classes and functions

# HTML parser class
class parseLinks(HTMLParser.HTMLParser):
    """Collect every not-yet-seen, same-site link of a page into the global dict ``l``."""

    def handle_starttag(self, tag, attrs):
        # Only anchor tags contribute links.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    url = url_normalize(value)
                    # "url in l" replaces the deprecated dict.has_key():
                    # identical behaviour, and also valid in Python 3.
                    if url != "" and url not in l:
                        l[url] = True

# HTML parsing function (use the class)
# HTML parsing function (use the class)
def parse_links(url):
    """Fetch *url* and feed its content to parseLinks (best effort)."""
    try:
        lParser = parseLinks()
        lParser.feed(urllib.urlopen(url).read())
        lParser.close()
    except Exception:
        # Best-effort crawl: skip unfetchable/unparsable pages, but unlike
        # a bare "except:" do not swallow KeyboardInterrupt/SystemExit.
        pass

# clean/normalize/reject url
def url_normalize(url):
    """Clean up *url*; return "" when it should not be crawled.

    Reads the globals ``current_url`` (page being parsed, used to resolve
    relative links) and ``base_url`` (crawl root).
    """
    url = url.strip()

    # mailto links are addresses, not pages
    if url.startswith('mailto:'):
        return ""

    # drop any "#fragment" part
    url = url.partition('#')[0]

    # resolve relative to the current page, then keep only in-tree links
    url = urlparse.urljoin(current_url, url)
    if not url.startswith(base_url):
        return ""

    # only HTML pages are worth validating
    if urllib.urlopen(url).info().gettype() != 'text/html':
        return ""

    return url

# W3C validation
def url_w3c_validate(url):
    """Ask the W3C service whether *url* is valid HTML."""
    response = urllib.urlopen(w3c_validator + 'check?uri=' + url)
    status = response.info().getheader('x-w3c-validator-status')
    return status == 'Valid'

##################################################
# main
##################################################
# Breadth-first crawl: "l" maps url -> True while the page still needs a
# visit, False once processed.  "l_error" collects pages that failed W3C
# validation; "n" counts pages visited.
l = {base_url: True}
l_error = []
n = 0
for i in range(depth):
    # iterate over a snapshot, since parse_links() inserts new urls into "l"
    for url in l.copy():
        if l[url]:
            n += 1
            # global read by url_normalize() to resolve relative links
            current_url = url
            print n,
            print "-",
            print current_url,

            print " parsing...",
            parse_links(url)
            print "done -",

            print "validating...",
            is_valid = url_w3c_validate(url)
            print "done ->",
            if is_valid:
                print "Valid"
            else:
                l_error.append(url)
                print "Invalid"

            # mark as visited so later passes skip it
            l[url] = False

#report
print """
-------------------------------------
URLs parsed: %d
URLS with invalid HTML: %d""" % (len(l), len(l_error))

for url in l_error:
    print url

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.