#!/usr/bin/env python
# Copyright (c) 2007, Mike Ramirez
#
# Checks a webpage and the links and responses associated with links.
# Good for checking for dead links
#
# Usage:
# $ ./linkchecker --help # or -h for help
# $ ./linkchecker google.com # can use http:// or not
# $ ./linkchecker google.com 200 # list all links with the response 200
# $ ./linkchecker google.com 4 # list all links with the response in the range of 4xx
#
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# Neither the name of the ##linux-coders nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# import sys, socket, formatter from urllib2 import Request, urlopen, URLError from htmllib import HTMLParser from sgmllib import SGMLParser from cStringIO import StringIO timeout = 10 passedpages = {} socket.setdefaulttimeout(timeout) # Table mapping response codes to messages; entries have the # form {code: (shortmessage, longmessage)}. responses = { 100: ('Continue', 'Request received, please continue'), 101: ('Switching Protocols', 'Switching to new protocol; obey Upgrade header'), 200: ('OK', 'Request fulfilled, document follows'), 201: ('Created', 'Document created, URL follows'), 202: ('Accepted', 'Request accepted, processing continues off-line'), 203: ('Non-Authoritative Information', 'Request fulfilled from cache'), 204: ('No Content', 'Request fulfilled, nothing follows'), 205: ('Reset Content', 'Clear input form for further input.'), 206: ('Partial Content', 'Partial content follows.'), 300: ('Multiple Choices', 'Object has several resources -- see URI list'), 301: ('Moved Permanently', 'Object moved permanently -- see URI list'), 302: ('Found', 'Object moved temporarily -- see URI list'), 303: ('See Other', 'Object moved -- see Method and URL list'), 304: ('Not Modified', 'Document has not changed since given time'), 305: ('Use Proxy', 'You must use proxy specified in Location to access this ' 'resource.'), 307: ('Temporary Redirect', 'Object moved temporarily -- see URI list'), 400: ('Bad Request', 'Bad request syntax or unsupported method'), 401: ('Unauthorized', 'No permission -- see authorization schemes'), 402: ('Payment Required', 'No payment -- see charging schemes'), 403: ('Forbidden', 'Request forbidden -- authorization will not help'), 404: ('Not Found', 'Nothing matches the given URI'), 405: ('Method Not Allowed', 'Specified method is invalid for this server.'), 406: ('Not Acceptable', 'URI not available in preferred format.'), 407: ('Proxy Authentication Required', 'You must authenticate with ' 'this proxy before proceeding.'), 408: 
('Request Timeout', 'Request timed out; try again later.'), 409: ('Conflict', 'Request conflict.'), 410: ('Gone', 'URI no longer exists and has been permanently removed.'), 411: ('Length Required', 'Client must specify Content-Length.'), 412: ('Precondition Failed', 'Precondition in headers is false.'), 413: ('Request Entity Too Large', 'Entity is too large.'), 414: ('Request-URI Too Long', 'URI is too long.'), 415: ('Unsupported Media Type', 'Entity body in unsupported format.'), 416: ('Requested Range Not Satisfiable', 'Cannot satisfy request range.'), 417: ('Expectation Failed', 'Expect condition could not be satisfied.'), 500: ('Internal Server Error', 'Server got itself in trouble'), 501: ('Not Implemented', 'Server does not support this operation'), 502: ('Bad Gateway', 'Invalid responses from another server/proxy.'), 503: ('Service Unavailable', 'The server cannot process the request due to a high load'), 504: ('Gateway Timeout', 'The gateway server did not receive a timely response'), 505: ('HTTP Version Not Supported', 'Cannot fulfill request.'), } # from kelvie wong's post on ASPN Cookbook # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/168639 # class progressBar: """ Creates a text-based progress bar. Call the object with the `print' command to see the progress bar, which looks something like this: [=======> 22% ] You may specify the progress bar's width, min and max values on init. """ def __init__(self, minValue = 0, maxValue = 100, totalWidth=80): self.progBar = "[]" # This holds the progress bar string self.min = minValue self.max = maxValue self.span = maxValue - minValue self.width = totalWidth self.amount = 0 # When amount == max, we are 100% done self.updateAmount(0) # Build progress bar string def updateAmount(self, newAmount = 0): """ Update the progress bar with the new amount (with min and max values set at initialization; if it is over or under, it takes the min or max value as a default. 
""" if newAmount < self.min: newAmount = self.min if newAmount > self.max: newAmount = self.max self.amount = newAmount # Figure out the new percent done, round to an integer diffFromMin = float(self.amount - self.min) percentDone = (diffFromMin / float(self.span)) * 100.0 percentDone = int(round(percentDone)) # Figure out how many hash bars the percentage should be allFull = self.width - 2 numHashes = (percentDone / 100.0) * allFull numHashes = int(round(numHashes)) # Build a progress bar with an arrow of equal signs; special cases for # empty and full if numHashes == 0: self.progBar = "[>%s]" % (' '*(allFull-1)) elif numHashes == allFull: self.progBar = "[%s]" % ('='*allFull) else: self.progBar = "[%s>%s]" % ('='*(numHashes-1), ' '*(allFull-numHashes)) # figure out where to put the percentage, roughly centered percentPlace = (len(self.progBar) / 2) - len(str(percentDone)) percentString = str(percentDone) + "%" # slice the percentage into the bar self.progBar = ''.join([self.progBar[0:percentPlace], percentString, self.progBar[percentPlace+len(percentString):] ]) def __str__(self): return str(self.progBar) def __call__(self, value): """ Updates the amount, and writes to stdout. Prints a carriage return first, so it will overwrite the current line in stdout.""" print '\r', self.updateAmount(value) sys.stdout.write(str(self)) sys.stdout.flush() class LinkChecker(object): """ Simple clase to retrieve a webpage and get the links and responses associated with the links. Simple use: mylinks = LinkChecker(url) debugging mode: mylinks = LinkChecker(url, 1) returns a dictionary of responses, the numberic code is the key and a list item of the message, then links that return that code. The messages are listed above in the responses dictionary. 
debug and verbose are booleans, verbose isn't implemented, use debug for more info """ def __init__(self, url, debug=0, verbose=0): object.__init__(self) self.debug = debug self.verbose = verbose self.noPage = 0 self.url = url self.res = self._openUrl() if self.debug: print '%s' %self.res try: self.htmlContents = self.res.read() except: self.noPage = 1 return if self.debug: print '%s' %self.htmlContents self.parsedhtml = self._readHtml() self.realUrl = self.res.geturl() if self.debug: print '%s' %self.realUrl self.headers = self.res.headers if self.debug: print '%s' % self.headers self.title = self.parsedhtml.title self._getLinks() self.totallinks = self.absolutelinks+self.prefixUrlToRelativeLinks() self.errorpages = {} self._removeDupLinks() def _openUrl(self, url=None): """ Function to open url, returns a urllib resource """ if self.debug: print 'Inside _openUrl()' if not url: url = self.url request = Request(url) if self.debug: print 'Trying to open %s' % self.url try: res = urlopen(request) except URLError, e: res = {} if hasattr(e, 'reason'): res['error'] = e.reason return res elif hasattr(e, 'code'): res['error'] = e.code return res else: pass return res ## set up formqtter and read html def _textFormatter(self): emptyformatter = formatter.DumbWriter(StringIO()) htmlformatter = formatter.AbstractFormatter(emptyformatter) return HTMLParser(htmlformatter) def _readHtml(self): parsedhtml = self._textFormatter() parsedhtml.feed(self.htmlContents) return parsedhtml def _getLinks(self): """ Parses the html and gets the links and creates the lists seperates relative/absolute and email links """ if self.debug: print 'This is parsed html %s' % self.parsedhtml self.relativelinks = [] self.absolutelinks = [] self.emaillinks = [] for item in self.parsedhtml.anchorlist: if item[:6] == 'mailto': self.emaillinks.append(item) elif item[0] == '/' or item[0] == '.' or item[0] == '#' or item[0] == '?' 
or item[:4] != 'http': self.relativelinks.append(item) else: self.absolutelinks.append(item) def _removeDupLinks(self): """ Removes dups, Might want to have this handle the dups list to return a list of links that are dups """ self.dups = [] self.uniques = [] for item in self.totallinks: try: self.uniques.index(item) self.dups.append(item) except: self.uniques.append(item) def prefixUrlToRelativeLinks(self): """ Presentation use, mainly to give working urls for relative links on the main page. """ templinks = [] for item in self.relativelinks: if item[:2] == '//': myurl = 'http:%s' %(item) elif item[0] == '#' or item[0] == '?': myurl = '%s/%s' %(self.url, item) elif item[:1] == '.': myurl = '%s%s' %(self.url, item[1:]) elif item[:1] == '/': myurl = '%s%s' %(self.url, item) elif item[:4] != 'http': myurl = '%s/%s' %(self.url, item) else: myurl = '%s%s' %(self.url, item) templinks.append(myurl) return templinks def checkLinks(self, myurl=False): """ Ties everthing to together to get all links myurl=False is used to allow you to use this function without initializing the class. if myurl isn't set then it will use self.url you initialized, good for recursing through response -113 is for html with no anchor tags """ if not myurl: myurl = self.url if self.debug: print '\nChecking %s links:\n%s' %(len(self.uniques), '-'*50) y = 0 if self.debug: print len(self.uniques) if len(self.uniques) <= 0: responses[-113] = ('No Links Found', self.url,) return responses # follwoing is to setup the progbar, # it fails with a large list so we split up update time if len(self.uniques) <= 249: u = 5 elif len(self.uniques) >= 250 and len(self.uniques) <=499: u = 10 else: u = 25 prog = progressBar(y, len(self.uniques)) print prog # the real work is here # takes the list of links and gets the response code # and adds the url to the set. 
# returns back responses with urls for myurl in self.uniques: y +=1 if self.debug: print 'Checking %s out of %s - %s' %(y, len(self.uniques), myurl) self.url = myurl self.res = self._openUrl() try: if responses.has_key(self.res.code): responses[self.res.code] += (self.url,) else: responses[self.res.code] += ('Unknown Response Code', 'New Response, or did you fuck up?', self.url,) except: try: a = int(self.res['error'][0]) if responses.has_key(int(self.res['error'][0])): responses[int(self.res['error'][0])] += (self.url,) else: responses[int(self.res['error'][0])] = (self.res['error'][1], self.url) except: if self.res['error'] == 'timed out': responses[408] += (self.url,) try: if responses.has_key(int(self.res['error'])): responses[self.res['error']] += (self.url,) except: # set response -113, not official so that's why not in the list above if responses.has_key(-113): responses[-113] += (self.url,) else: responses[-113] = ('Unknown Type: %s\tError: %s' % (type(self.res['error']), self.res['error']), self.url,) if self.debug: print self.res['error'] print type(self.res['error']) if y == 5 and u != 5: prog.updateAmount(y) print prog if y%u == 0: prog.updateAmount(y) print prog return responses def getLinksWithResponses(self, responses): """ list all links with responses and codes output formatting """ for k,v in responses.iteritems(): try: a = len(v[2]) # should break here with 0 links print "" print '%s Response:' % k if str(k)[0] == '-': print '\tMessage: %s' % v[0] linklist = v[1:] else: print '\tMessage: %s - %s' %(v[0], v[1]) linklist = v[2:] print '\tTotal links: %s' % len(linklist) print '\tLink List:' y = 0 for x in linklist: y += 1 print '\t\t%s. %s' %(y, x) except: pass if self.debug: print 'Exiting getResponses' def getSpecificResponse(self, responses, respCode): """ Should allow checking responses for a specific code and url, more output processing/formatting. 
""" if not respCode: print 'Please Specificy a response code' if responses.has_key(respCode): v = responses[respCode] print '\n%s Response:' % respCode print '\tMessage: %s - %s' %(v[0], v[1]) try: a = len(v[2]) # should break here with 0 links linklist = v[2:] print '\tTotal links: %s' % len(linklist) print '\tLink List:' y = 0 for x in linklist: y += 1 print '\t\t%s. %s' %(y, x) except: print '\t\tNo links responded with %s' % respCode def getResponseRange(self, mylinks, respCode): """ Prints the range of codes and responses. """ keys = mylinks.keys() if self.debug: print 'keys: %s' % keys print 'respCode: %s' % respCode for key in keys: if str(key)[0] == respCode: v = mylinks[key] print '\n%s Response:' % key print '\tMessage: %s - %s' %(v[0], v[1]) try: a = len(v[2]) linklist = v[2:] print '\tTotal links: %s' % len(linklist) print '\tLink List:' y = 0 for x in linklist: y += 1 print '\t\t%s. %s' %(y, x) except: print 'No links responded with %s' % key def show_help(): print 'Basic usage: linkchecker [url] [responsecode]\n' print 'The url is required, only supports http or https.\n\tIf you do not add http:// it will be added for you\n' print 'Responsecode is any valid httpd response value. 
\n\tif not used, linkchecher will display all links returning a response' print '\nSample usage: linkchecker google.com' print 'You can specify a range of codes or specific code to check\n\tfor example if you only want to see all pages with a response in the 400s: ' print '\tlinckhecker linux-coders.org 200' print '\t\tShows all links with response 200' print '\tlinkchecker linux-coders.org 4' print '\t\tReturns all links with the response in 4xx' print '\t\t 1,2,3,4,5 are valid ranges' print 'linkchecker [--help|-h] - displays this' if __name__ == '__main__': try: myarg = sys.argv[1] except: show_help() sys.exit() ## check for help and add http:// to url if imissing if sys.argv[1] == '--help' or sys.argv[1] == '-h': show_help() sys.exit() if myarg[:4] != 'http': myarg = 'http://%s' % myarg ## initilaze LinkChecker() myLc = LinkChecker(myarg) ## if page is unavalibls if myLc.noPage: print '\n\t%s is Unavailable' % (myLc.url) for k,v in myLc.res.iteritems(): if responses.has_key(int(v)): print '\tError: %s - %s - %s' %(int(v),responses[int(v)][0],responses[int(v)][1]) else: print '\t%s %s' %(k,v) sys.exit() ## checkin links mylinks = myLc.checkLinks(myarg) ## base info included with all output print 'Url to Check:\t\t%s' % myarg print 'Actual Url Checked:\t%s' % myLc.realUrl print 'Page Title:\t\t%s' % myLc.title print '\nHeaders:\n%s\n' % myLc.headers print 'Total links checked:\t\t\t%s' % len(myLc.totallinks) print 'Number of Links with absolute urls:\t%s' % len(myLc.absolutelinks) print 'Number of links with relative urls:\t%s' % len(myLc.relativelinks) print 'Number of Uniques:\t\t\t%s' % len(myLc.uniques) print 'Number of Duplicate links:\t\t%s' % len(myLc.dups) if myLc.errorpages: print 'Error Pages:' for k,v in myLc.errorpages.iteritems(): print '\t%s\n\t\t%s' % (k,v) if myLc.emaillinks: print 'Email links:' z = 0 for item in myLc.emaillinks: z += 1 print '\t\t%s. 
%s' %(z, item) ## should implement optparse for handling the different outputs ## get responses if len(sys.argv[2]) == 3: myLc.getSpecificResponse(mylinks, int(sys.argv[2])) elif sys.argv[2] == '1' or sys.argv[2] == '2' or sys.argv[2] == '3' or sys.argv[2] == '4' or sys.argv[2] == '5': myLc.getResponseRange(mylinks, sys.argv[2]) else: myLc.getLinksWithResponses(mylinks)