from htmlentitydefs import entitydefs
import HTMLParser
class TitleParser(HTMLParser.HTMLParser):
def __init__(self):
self.taglevels=[]
self.handledtags=['title','body']
self.processing=None
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
if tag in self.handledtags:
self.data=''
self.processing=tag
if tag =='a':
for name,value in attrs:
if name=='href':
print '连接地址:'+value
def handle_data(self,data):
if self.processing:
self.data +=data
def handle_endtag(self,tag):
if tag==self.processing:
print str(tag)+':'+str(tp.gettitle())
self.processing=None
def handle_entityref(self,name):
if entitydefs.has_key(name):
self.handle_data(entitydefs[name])
else:
self.handle_data('&'+name+';')
def handle_charref(self,name):
try:
charnum=int(name)
except ValueError:
return
if charnum<1 or charnum>255:
return
self.handle_data(chr(charnum))
def gettitle(self):
return self.data
fd=open('test1.html')
tp=TitleParser()
tp.feed(fd.read())
运行结果为:
title: XHTML 与" HTML 4.01 "标准没有太多的不同
连接地址:http://pypi.python.org/pypi
body:
i love÷ you×










