#!/usr/bin/python
"""Download a web page and print its title and every HTML comment in it.

Usage: pass the URL of the page as the first command-line argument.

Written for Python 2: it uses ``urllib2`` and the ``BeautifulSoup``
(version 3) package.
"""

# urllib2 downloads the page; BeautifulSoup parses it into a DOM-style tree
# (a stream parser would also work, but this example uses the DOM).
import contextlib
import sys
import urllib2

from BeautifulSoup import BeautifulSoup
from BeautifulSoup import Comment


def fetch_page(url):
    """Return the raw body of the HTTP response for ``url``.

    The response object is closed as soon as the body has been read, even
    if reading raises. Errors from ``urllib2.urlopen`` propagate to the
    caller unchanged.
    """
    # urllib2 responses are not context managers themselves, hence closing().
    with contextlib.closing(urllib2.urlopen(url)) as http_response:
        return http_response.read()


def main():
    """Fetch the URL given on the command line and dump title and comments.

    Returns:
        0 on success, 2 when no URL argument was supplied.
    """
    if len(sys.argv) < 2:
        # Without this check a missing argument surfaces as a bare IndexError.
        sys.stderr.write("usage: %s URL\n" % sys.argv[0])
        return 2

    html_page = fetch_page(sys.argv[1])

    # Let's feed the HTML page to the DOM parser.
    html_dom = BeautifulSoup(html_page)

    # Dump the page title. A page without a <title> tag yields None here, so
    # guard against it rather than failing on ``None.string``.
    title = html_dom.title
    if title is None:
        sys.stderr.write("page has no <title> element\n")
    else:
        print(title.string)

    # Optional: dump all links in the page.
    # all_links = html_dom.findAll('a', {'href': True})
    # for link in all_links:
    #     print(link['href'])

    # Print all the comments in the HTML page: the text filter keeps only
    # the text nodes that are Comment instances.
    all_comments = html_dom.findAll(
        text=lambda text: isinstance(text, Comment))
    for each_comment in all_comments:
        print(each_comment)

    # dump all the text, individual tags ...
    return 0


if __name__ == "__main__":
    sys.exit(main())