16.1 Parsing an HTML document
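N.B. This code is written for Python 2. In Python 3 the module is named html.parser and print is a function, so the listings would need adapting.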
16.2 Open an HTML File
First instantiate a parser. The MyHTMLParser class, from which we instantiate the parser, will need to be written before the code can be run.
Use Python's built-in open() function to open the file as f and check that the file mode is set to "read" ("r"); if so, read the contents of the file into a contents variable. Pass that variable into the as-yet-undefined parser.
We could use urllib to open a URL, but here we are working with a local file.
# NOTE: HTMLParser is not imported yet; the import is added in the next step.
class MyHTMLParser(HTMLParser):
    """Placeholder parser; the handler methods still need to be written."""
    # This class needs to be written
    pass


def main():
    """Read samplehtml.html and feed its contents to the parser."""
    # instantiate the parser and feed it some HTML
    parser = MyHTMLParser()

    # open the sample HTML file and read it; the with-statement closes the
    # file even if reading or parsing raises
    with open("samplehtml.html") as f:
        if f.mode == "r":
            contents = f.read()  # read the entire file
            parser.feed(contents)


if __name__ == "__main__":
    main()
16.3 Make MyHTMLParser a sub-class of HTMLParser
Import HTMLParser. Pass HTMLParser to MyHTMLParser as its base class. This makes MyHTMLParser a sub-class of HTMLParser. Now the methods of HTMLParser can be overridden by MyHTMLParser.
# import the HTMLParser module
from HTMLParser import HTMLParser


# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """Subclass of HTMLParser; no handler methods are overridden yet."""
    # write this.
    pass


def main():
    """Read samplehtml.html and feed its contents to the parser."""
    # instantiate the parser and feed it some HTML
    parser = MyHTMLParser()

    # open the sample HTML file and read it; the with-statement closes the
    # file even if reading or parsing raises
    with open("samplehtml.html") as f:
        if f.mode == "r":
            contents = f.read()  # read the entire file
            parser.feed(contents)


if __name__ == "__main__":
    main()
16.4 Handle HTML Comments
We add a method for parsing HTML comments by overriding the handle_comment() method of HTMLParser in our subclass MyHTMLParser.
When you pass the contents variable into the parser (parser.feed(contents)), the parser will call any of the superclass methods you have overridden in your subclass.
To have the parser do something with comments, we just need to override the handle_comment() method in our subclass and say what we want done.
Now we can go and override the method for any element we want to deal with...
# import the HTMLParser module from HTMLParser import HTMLParser # create a subclass and override the handler methods class MyHTMLParser(HTMLParser): # function to handle the processing of HTML comments def handle_comment(self, data): print "Encountered comment:", data pos = self.getpos() print "At line: ", pos[0], " position ", pos[1] def main(): # instantiate the parser and feed it some HTML parser = MyHTMLParser() # open the sample HTML file and read it f = open("samplehtml.html") if f.mode == "r": contents = f.read() # read the entire file parser.feed(contents) if __name__ == "__main__": main()
16.5 Override more methods of the base class
Here we handle the start tag, the end tag and data, which is the text content of an HTML element.
# import the HTMLParser module from HTMLParser import HTMLParser # create a subclass and override the handler methods class MyHTMLParser(HTMLParser): # function to handle an opening tag in the doc # this will be called when the closing ">" of the tag is reached def handle_starttag(self, tag, attrs): pos = self.getpos() # returns a tuple indication line and character print "At line: ", pos[0], " position ", pos[1] if attrs.__len__ > 0: print "\tAttributes:" for a in attrs: print "\t", a[0], "=", a[1] # function to handle the ending tag def handle_endtag(self, tag): print "Encountered an end tag:", tag pos = self.getpos() print "At line: ", pos[0], " position ", pos[1] # function to handle character and text data (tag contents) def handle_data(self, data): print "Encountered some data:", data pos = self.getpos() print "At line: ", pos[0], " position ", pos[1] # function to handle the processing of HTML comments def handle_comment(self, data): print "Encountered comment:", data pos = self.getpos() print "At line: ", pos[0], " position ", pos[1] def main(): # instantiate the parser and feed it some HTML parser = MyHTMLParser() # open the sample HTML file and read it f = open("samplehtml.html") if f.mode == "r": contents = f.read() # read the entire file parser.feed(contents) if __name__ == "__main__": main()
The start tag method also takes an attributes argument: def handle_starttag(self, tag, attrs):
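So, if there are any attributes (test this with if len(attrs) > 0: or simply if attrs: since a bare attrs.__len__ > 0 compares the method object rather than the length, and in CPython 2 that comparison is always true)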
We're going to print them out
if attrs.__len__ > 0: print "\tAttributes:" for a in attrs: print "\t", a[0], "=", a[1]
16.6 Count the meta-tags in the head
Add a module-level counter and set it to zero: metacount = 0
Add the following to handle_starttag so that the counter is incremented for every meta tag:
def handle_starttag(self, tag, attrs): global metacount print "Encountered a start tag:", tag if tag == "meta": metacount += 1
And lastly print out the number of metatags encountered
# goes at the end of main(), after parser.feed(contents) has run
print "%d meta tags encountered" % metacount
Resulting in:
# import the HTMLParser module from HTMLParser import HTMLParser metacount = 0; # create a subclass and override the handler methods class MyHTMLParser(HTMLParser): # function to handle an opening tag in the doc # this will be called when the closing ">" of the tag is reached def handle_starttag(self, tag, attrs): global metacount print "Encountered a start tag:", tag if tag == "meta": metacount += 1 pos = self.getpos() # returns a tuple indication line and character print "At line: ", pos[0], " position ", pos[1] if attrs.__len__ > 0: print "\tAttributes:" for a in attrs: print "\t", a[0], "=", a[1] # function to handle the ending tag def handle_endtag(self, tag): print "Encountered an end tag:", tag pos = self.getpos() print "At line: ", pos[0], " position ", pos[1] # function to handle character and text data (tag contents) def handle_data(self, data): print "Encountered some data:", data pos = self.getpos() print "At line: ", pos[0], " position ", pos[1] # function to handle the processing of HTML comments def handle_comment(self, data): print "Encountered comment:", data pos = self.getpos() print "At line: ", pos[0], " position ", pos[1] def main(): # instantiate the parser and feed it some HTML parser = MyHTMLParser() # open the sample HTML file and read it f = open("samplehtml.html") if f.mode == "r": contents = f.read() # read the entire file parser.feed(contents) print "%d meta tags encountered" % metacount if __name__ == "__main__": main()