16.1 Parsing an HTML document

N.B. This code is written for Python 2: it uses the print statement and the HTMLParser module (renamed html.parser in Python 3).

16.2 Open an HTML File

First instantiate a parser. The MyHTMLParser class from which we instantiate parser will need to be written before the code can be run.

Use Python's built-in open() function to open the file; it returns a file object, f. Check that the file's mode is "r" (read) and, if so, read the contents of the file into a contents variable. Pass that variable to the as-yet-undefined parser with parser.feed(contents).

We could use the urllib to open a URL but here we are working with a local file.

class MyHTMLParser(HTMLParser):
    """Placeholder subclass of HTMLParser; its handler methods come later."""
    # This class needs to be written
    pass
 
 
def main():
    """Read samplehtml.html and feed its contents to a MyHTMLParser."""

    # instantiate the parser and feed it some HTML
    parser = MyHTMLParser()

    # open the sample HTML file and read it; the with-block closes the
    # file even if reading or parsing raises
    with open("samplehtml.html") as f:
        if f.mode == "r":
            contents = f.read()  # read the entire file
            parser.feed(contents)
 
# run main() only when this file is executed as a script
if __name__ == "__main__":
    main()

16.3 Make MyHTMLParser a sub-class of HTMLParser

Import HTMLParser and list it as the base class of MyHTMLParser, i.e. class MyHTMLParser(HTMLParser). This makes MyHTMLParser a subclass of HTMLParser, so the methods of HTMLParser can now be overridden by MyHTMLParser.

# import the HTMLParser module
from HTMLParser import HTMLParser
 
 
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """Placeholder subclass of HTMLParser; its handler methods come later."""
    # write this.
    pass
 
 
def main():
    """Read samplehtml.html and feed its contents to a MyHTMLParser."""

    # instantiate the parser and feed it some HTML
    parser = MyHTMLParser()

    # open the sample HTML file and read it; the with-block closes the
    # file even if reading or parsing raises
    with open("samplehtml.html") as f:
        if f.mode == "r":
            contents = f.read()  # read the entire file
            parser.feed(contents)
 
if __name__ == "__main__":
    main()

16.4 Handle HTML Comments

We add a method for parsing HTML comments by overriding the handle_comment() method of HTMLParser in our subclass MyHTMLParser.

When you pass the contents variable into the parser (parser.feed(contents)) the parser will call any of the superclass functions you have overridden in your subclass.

To have the parser do something with comments we just need to override the handle_comment() method in our subclass and say what we want done.

Now we can go and override the method for any element we want to deal with...

# import the HTMLParser module
from HTMLParser import HTMLParser
 
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
 
    # function to handle the processing of HTML comments
    def handle_comment(self, data):
        print "Encountered comment:", data
        pos = self.getpos()
        print "At line: ", pos[0], " position ", pos[1]
 
 
def main():
    """Read samplehtml.html and feed its contents to a MyHTMLParser."""

    # instantiate the parser and feed it some HTML
    parser = MyHTMLParser()

    # open the sample HTML file and read it; the with-block closes the
    # file even if reading or parsing raises
    with open("samplehtml.html") as f:
        if f.mode == "r":
            contents = f.read()  # read the entire file
            parser.feed(contents)
 
if __name__ == "__main__":
    main()

16.5 Override more methods of the base class

Here we handle start tag, end tag and data, which is the text content of an html element.

# import the HTMLParser module
from HTMLParser import HTMLParser
 
# create a subclass and override the handler methods
 
 
class MyHTMLParser(HTMLParser):
 
    # function to handle an opening tag in the doc
    # this will be called when the closing ">" of the tag is reached
    def handle_starttag(self, tag, attrs):
        pos = self.getpos()  # returns a tuple indication line and character
        print "At line: ", pos[0], " position ", pos[1]
        if attrs.__len__ > 0:
            print "\tAttributes:"
            for a in attrs:
                print "\t", a[0], "=", a[1]
 
    # function to handle the ending tag
    def handle_endtag(self, tag):
        print "Encountered an end tag:", tag
        pos = self.getpos()
        print "At line: ", pos[0], " position ", pos[1]
 
    # function to handle character and text data (tag contents)
    def handle_data(self, data):
        print "Encountered some data:", data
        pos = self.getpos()
        print "At line: ", pos[0], " position ", pos[1]
 
    # function to handle the processing of HTML comments
    def handle_comment(self, data):
        print "Encountered comment:", data
        pos = self.getpos()
        print "At line: ", pos[0], " position ", pos[1]
 
 
def main():
    """Read samplehtml.html and feed its contents to a MyHTMLParser."""

    # instantiate the parser and feed it some HTML
    parser = MyHTMLParser()

    # open the sample HTML file and read it; the with-block closes the
    # file even if reading or parsing raises
    with open("samplehtml.html") as f:
        if f.mode == "r":
            contents = f.read()  # read the entire file
            parser.feed(contents)
 
if __name__ == "__main__":
    main()

The start tag method also takes an attributes argument: def handle_starttag(self, tag, attrs). Each item in attrs is a (name, value) pair.

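So, if there are any attributes, we want to list them. Test for this with "if attrs:" (or "if len(attrs) > 0:"). Written as "if attrs.__len__ > 0:" the test compares the uncalled method object with 0, which is always true in Python 2, so the "Attributes:" heading would appear even for tags that have none.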

We're going to print them out

if attrs.__len__ > 0:
    print "\tAttributes:"
    for a in attrs:
        print "\t", a[0], "=", a[1]

16.6 Count the meta-tags in the head

Add a module-level counter and set it to zero: metacount = 0

Add a check to handle_starttag that increments the counter for every meta tag:

def handle_starttag(self, tag, attrs):
    # metacount is assigned to below, so it must be declared global here
    global metacount
    print "Encountered a start tag:", tag
    # count only the meta tags
    if tag == "meta":
        metacount += 1

And lastly, at the end of main(), print out the number of meta tags encountered:

 # report the total held in the metacount counter
 print "%d meta tags encountered" % metacount

Resulting in:

# import the HTMLParser module
from HTMLParser import HTMLParser
 
 
# running total of meta start tags, incremented by handle_starttag
metacount = 0
 
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
 
    # function to handle an opening tag in the doc
    # this will be called when the closing ">" of the tag is reached
    def handle_starttag(self, tag, attrs):
        global metacount
        print "Encountered a start tag:", tag
        if tag == "meta":
            metacount += 1
 
        pos = self.getpos()  # returns a tuple indication line and character
        print "At line: ", pos[0], " position ", pos[1]
        if attrs.__len__ > 0:
            print "\tAttributes:"
            for a in attrs:
                print "\t", a[0], "=", a[1]
 
    # function to handle the ending tag
    def handle_endtag(self, tag):
        print "Encountered an end tag:", tag
        pos = self.getpos()
        print "At line: ", pos[0], " position ", pos[1]
 
    # function to handle character and text data (tag contents)
    def handle_data(self, data):
        print "Encountered some data:", data
        pos = self.getpos()
        print "At line: ", pos[0], " position ", pos[1]
 
    # function to handle the processing of HTML comments
    def handle_comment(self, data):
        print "Encountered comment:", data
        pos = self.getpos()
        print "At line: ", pos[0], " position ", pos[1]
 
 
def main():
 
    # instantiate the parser and feed it some HTML
    parser = MyHTMLParser()
 
    # open the sample HTML file and read it
    f = open("samplehtml.html")
    if f.mode == "r":
        contents = f.read()  # read the entire file
        parser.feed(contents)
 
     print "%d meta tags encountered" % metacount
 
if __name__ == "__main__":
    main()