python网络编程学习笔记(七):HTML和XHTML解析(HTMLParser、BeautifulSo

2019-10-06 14:22:10王旭

from htmlentitydefs import entitydefs
import HTMLParser
class TitleParser(HTMLParser.HTMLParser):
    def __init__(self):
        self.taglevels=[]
        self.handledtags=['title','body']
        self.processing=None
        HTMLParser.HTMLParser.__init__(self)       
    def handle_starttag(self,tag,attrs):
        if tag in self.handledtags:
            self.data=''
            self.processing=tag
        if tag =='a':
            for name,value in attrs:
                if name=='href':
                    print '连接地址:'+value
    def handle_data(self,data):
        if self.processing:
            self.data +=data
    def handle_endtag(self,tag):
        if tag==self.processing:
            print str(tag)+':'+str(tp.gettitle())
            self.processing=None
    def handle_entityref(self,name):
        if entitydefs.has_key(name):
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&'+name+';')

    def handle_charref(self,name):
        try:
            charnum=int(name)
        except ValueError:
            return
        if charnum<1 or charnum>255:
            return
        self.handle_data(chr(charnum))

    def gettitle(self):
        return self.data
fd=open('test1.html')
tp=TitleParser()
tp.feed(fd.read())

运行结果为:
title: XHTML 与" HTML 4.01 "标准没有太多的不同
连接地址:http://pypi.python.org/pypi
body:

i love÷ you×