import htmllib, formatter,re,string
import urllib, htmllib, sys
class LinksExtractor(htmllib.HTMLParser): # derive new HTML parser
def __init__(self, formatter) : # class constructor
htmllib.HTMLParser.__init__(self, formatter) # base class constructor
self.links = [] # create an empty list for storing hyperlinks
def start_a(self, attrs) : # override handler of ... tags
# process the attributes
if len(attrs) > 0 :
for attr in attrs :
if attr[0] == "href" : # ignore all non HREF attributes
self.links.append(attr[1]) # save the link info in the list
def get_links(self) : # return the list of extracted links
return self.links
# One parser instance is created up front. NullFormatter is used because only
# the collected hrefs matter, not any rendered output.
# NOTE(review): `format` shadows the builtin of the same name; it is kept
# because code outside this block may still reference it.
format = formatter.NullFormatter()
htmlparser = LinksExtractor(format)

# The first command-line argument (the link-list filename) is mandatory.
if len(sys.argv) < 2:
    # Report the usage error on stderr and exit with a non-zero status so a
    # calling shell or script can detect the failure (exiting with 0 would
    # signal success).
    sys.stderr.write("ERROR: Provide linklist filename\n")
    sys.exit(1)
if len(sys.argv) == 2:
    # Pad argv with an empty string so sys.argv[2] can always be indexed,
    # even when no second argument was supplied.
    sys.argv = sys.argv + ['']

allLinks = []  # presumably accumulates links across all pages -- confirm
# Opened here and read line by line by the loop that follows this block.
linklist = open(sys.argv[1], 'r')
while True:
link = linklist.readline()
if not link:
break
print "now processing: " + link.rstrip()
data = urllib.urlopen(link.rstrip())
#if sys.argv[2] == 'see':
# while True:
# sent = data.readline()
# if 'a name="See_also"' in sent:
# htmlparser.feed(data.read()) # parse the file saving the info about links
# break
# if '