#!/usr/bin/env python
'''Script to see if all destinations of a link on a webpage are still the
same as before.

For more information, see

    $ python linkvalidator.py -h
'''
__version__ = '1.0'
__date__ = 'November 2011'
__author__ = 'Vincent Van Asch'

import getopt
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time

try:
    import urllib2
except ImportError:
    # Python 3 moved urlopen() to urllib.request; alias it so the rest of
    # the module can keep calling urllib2.urlopen().
    import urllib.request as urllib2

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Path to opendiff binary
opendiff = 'opendiff'
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Matches an anchor tag; group(2) is the value of its href attribute.
# NOTE(review): the pattern previously started with a stray "]*" (the
# "<a ... [^>" prefix had been lost), which made it match href attributes of
# any tag. The "<a\b[^>]*" prefix is a reconstruction -- confirm against the
# upstream script.
linkpattern = re.compile(
    r'''<a\b[^>]*href\s*=\s*(['"])([^'"]+)\1\s*[^>]*>[^<]*''')


def _to_text(data):
    '''Returns data as a native string.

    Only Python 3 byte strings are decoded (UTF-8, undecodable bytes are
    replaced); everything else is returned unchanged, so Python 2 behaviour
    stays as it was.
    '''
    if isinstance(data, bytes) and not isinstance(data, str):
        return data.decode('utf-8', 'replace')
    return data


def _to_bytes(text):
    '''Returns text as a byte string (UTF-8 encoded when it is not bytes).'''
    if isinstance(text, bytes):
        return text
    return text.encode('utf-8')


def expandurl(url, base=None):
    '''Returns the list of candidate absolute urls for url.

    Surrounding whitespace is removed. A url starting with http://, https://
    or ftp:// is returned as is. Otherwise http:// is prepended and, if base
    is given and the url does not start with www, the url appended to base
    is returned as first candidate (so 2 urls are returned).

    The part after a # is cut off.

    Raises ValueError when a candidate contains more than one #.
    '''
    url = url.strip()
    if base:
        base = base.rstrip('/') + '/'

    if re.match('(https?|ftp)://', url):
        urls = [url]
    else:
        urls = []
        if base and not url.startswith('www'):
            urls.append(base + url)
        urls.append('http://' + url)

    for i, u in enumerate(urls):
        hashes = u.count('#')
        if hashes == 1:
            urls[i] = u[:u.index('#')]
        elif hashes > 1:
            raise ValueError('Do not know what to do with url containing multiple #: %s' % u)

    return urls


def getpage(url, base=None, verbose=False):
    '''Takes a url and returns a tuple (final url, page data).

    Every candidate of expandurl(url, base) is tried; the last one that can
    be opened is used. The page data is returned as a string in which every
    run of whitespace (newlines included) is replaced by a single space.

    If the page could not be retrieved, returns (None, '').
    '''
    page = None
    for u in expandurl(url, base):
        if verbose:
            # No newline: the [ok]/[failed] verdict goes on the same line.
            sys.stdout.write('Retrieving %s ... ' % u)
            sys.stdout.flush()
        try:
            response = urllib2.urlopen(u)
        except Exception:
            if verbose:
                print('[failed]')
        else:
            if verbose:
                print('[ok]')
            # A later candidate replaces an earlier one; do not leak the
            # connection of the response that is dropped.
            if page is not None:
                page.close()
            page = response

    if page is None:
        return None, ''

    try:
        txt = _to_text(page.read())
        finalurl = page.geturl()
    finally:
        page.close()

    # all newlines to whitespace
    return finalurl, ' '.join(txt.split())


def getlinks(url):
    '''Returns a sorted list of the unique links on a page.

    Returns an empty list if the page could not be retrieved.
    '''
    base, page = getpage(url)
    if not base:
        return []

    # NOTE(review): base is the final url of the page and expandurl() treats
    # it as a directory, so relative links on a page that is not a site root
    # may expand to the wrong url -- confirm whether that matters.
    links = set()
    for m in linkpattern.finditer(page):
        links.update(expandurl(m.group(2), base))

    return sorted(links)


def interesting(l):
    '''Returns True if the line is interesting for comparing; otherwise False.

    Lines starting with id="form- or value="form- are not interesting.
    '''
    return not l.startswith(('id="form-', 'value="form-'))


def store(url, data, dir):
    '''Stores url, the current time and data in a new temporary file in dir.

    Tries to clean up a little when storing: data is split on whitespace and
    only the interesting() parts are written, one per line.

    Returns the name of the file.
    '''
    # Outside any try block: when mkstemp fails there is nothing to close.
    fd, name = tempfile.mkstemp(dir=dir)

    lines = [url, time.strftime('%d/%m/%Y %H:%M:%S')]
    lines.extend(part for part in data.split() if interesting(part))

    with os.fdopen(fd, 'wb') as out:
        out.write(_to_bytes('\n'.join(lines) + '\n'))

    return name


def lread(fname):
    '''Reads a file written by store().

    Returns a tuple (url, date, data) where data is the rest of the file
    joined with spaces.

    Raises ValueError if the file has less than two lines.
    '''
    with open(fname, 'rb') as f:
        lines = [_to_text(line) for line in f.readlines()]

    if len(lines) < 2:
        raise ValueError('%s is not a valid snapshot file' % fname)

    url = lines[0].strip()
    date = lines[1].strip()
    data = ' '.join(lines[2:])
    return url, date, data


def downloadlinks(links, cachedir):
    '''Tries to download all pages in links and stores them in a tempfile.

    WARNING: if cachedir already exists it is removed with everything in it
    before it is created again.

    Returns a dictionary {final url: name of the file with the page}.
    '''
    if os.path.exists(cachedir):
        shutil.rmtree(cachedir)
    os.makedirs(cachedir)

    data = {}
    for link in links:
        url, page = getpage(link)
        # Different links can end up at the same final url; keep the first
        # download instead of leaving an unreferenced file behind.
        if url and url not in data:
            data[url] = store(url, page, cachedir)

    return data


def urlexists(url):
    '''Returns True if url can be opened; otherwise False.'''
    try:
        response = urllib2.urlopen(url)
    except Exception:
        return False
    response.close()
    return True


def snapshot(url, dir):
    '''Stores a snapshot of the link on the webpage url in the folder dir.

    The folder gets one file per retrievable link and a file called metadata
    that maps every url to its file.

    Raises ValueError if url cannot be retrieved or has no links.
    '''
    url = url.rstrip('/')

    if not urlexists(url):
        raise ValueError('Could not retrieve %s' % url)

    # Get all links
    links = getlinks(url)
    if not links:
        raise ValueError('No retrievable links found on %s' % url)

    # Store the data in dir
    dir = os.path.abspath(os.path.expanduser(dir))
    data = downloadlinks(links, dir)

    # Store the metadata
    name = os.path.join(dir, 'metadata')
    with open(name, 'w') as f:
        f.write('# SNAPSHOT: %s\n' % url)
        for item in sorted(data.items()):
            f.write('%s\t%s\n' % item)

    print('Snaphost saved in %s' % dir)


def _read_metadata(dir):
    '''Reads the metadata file of the snapshot folder dir.

    Returns a tuple (refurl, ref, ignores): the url the snapshot is taken
    from (None if the file does not mention it), a dictionary {url: file}
    for the links to validate and a similar dictionary for the links that
    are commented out with a #.

    Raises ValueError if the file is missing or contains a malformed line.
    '''
    n = os.path.join(dir, 'metadata')
    if not os.path.isfile(n):
        raise ValueError('No metadata file is present in %s' % dir)

    refurl = None
    ref = {}
    ignores = {}
    with open(n) as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            if line.startswith('# SNAPSHOT'):
                # "# SNAPSHOT: <url>": the url starts at the third field
                refurl = ' '.join(line.split()[2:])
                continue

            parts = line.split('\t')
            if len(parts) != 2:
                raise ValueError('Malformed line in %s: %s' % (n, line))
            u, fname = parts
            if u.startswith('#'):
                ignores[u.lstrip('#').strip()] = fname
            else:
                ref[u] = fname

    return refurl, ref, ignores


def check(url, dir, inspect=True, verbose=False):
    '''check to see if the links on url are still the same as when the
    snapshot dir has been taken.

    If inspect is True, every changed page is opened with opendiff. If
    verbose is True, a report is printed.

    Return True if something changed, otherwise False.

    Raises ValueError if url cannot be retrieved, if dir is not a snapshot
    of url or if opendiff is needed but cannot be started.
    '''
    dir = os.path.abspath(os.path.expanduser(dir))
    url = url.rstrip('/')

    if not urlexists(url):
        raise ValueError('Could not retrieve %s' % url)
    if not os.path.isdir(dir):
        raise ValueError('%s does not exist' % dir)

    # Read in metadata
    refurl, ref, ignores = _read_metadata(dir)
    if url != refurl:
        raise ValueError('url of snaphot (%s) is not the same as %s' % (refurl, url))

    # A dir to store the new snapshot. Created only after all validation so
    # an early error leaves nothing behind; always removed in the finally.
    cachedir = tempfile.mkdtemp(prefix='linkvalidator-')
    try:
        # Take new snapshot
        new = downloadlinks(getlinks(url), cachedir)

        # Check link per link
        contentdiff = []
        same = []
        removed = []
        refdate = None
        for link, fname in ref.items():
            if link not in new:
                removed.append(link)
                continue

            newlink, newdate, newpage = lread(new[link])
            reflink, refdate, refpage = lread(fname)
            if refpage == newpage:
                same.append(link)
                continue

            contentdiff.append(link)
            if inspect:
                try:
                    subprocess.call([opendiff, fname, new[link]])
                except OSError:
                    raise ValueError('Could not inspect files because "opendiff" binary is not found')

        added = [link for link in new
                 if link not in ref and link not in ignores]

        if not refdate:
            raise ValueError('No links from %s are in the snapshot. Maybe snaphshot is taken for a different url' % url)

        # Report
        if verbose:
            print('''Retrieved %d links during snapshot at %s
    Links with same contents       : %d%s
    Links ignored during validation: %d
    Links with other contents      : %d%s
    Links removed from webpage     : %d%s
    Links added to webpage         : %d%s''' % (
                len(ref) + len(ignores), refdate,
                len(same), report(same),
                len(ignores),
                len(contentdiff), report(contentdiff),
                len(removed), report(removed),
                len(added), report(added)))
    finally:
        shutil.rmtree(cachedir, ignore_errors=True)

    return bool(contentdiff or removed or added)


def report(l):
    '''Returns the items of l as an indented block, one item per line and
    starting with a newline. Returns an empty string if l is empty.'''
    if not l:
        return ''
    return '\n'.join([''] + ['        ' + x for x in l])


def _usage():
    '''Prints the help message on stderr.'''
    sys.stderr.write('''Script to check whether the contents of the links on a webpage is still the same (version %s)

USAGE
    python linkvalidator.py [-v] [-i] [-s] folder url

    folder: a folder where a snapshot should be stored/can be found
    url: url of webpage from which the links are checked

    -s : take snapshot instead of checking
    -i : try to open the changed webpages with opendiff
         opendiff should be installed; this is normally the case if FileMerge is installed (Mac OSX)
    -v : print more info when validating

WORKFLOW
    First, a snapshot should be taken from the contents of the links on the webpage.
    This snapshot will be saved in the specified folder:
        $ python linkvalidator.py -s snapshot-08112011 http://www.clips.ua.ac.be

    At any moment, the contents can be validate against this snapshot:
        $ python linkvalidator.py snapshot-08112011 http://www.clips.ua.ac.be

IGNORING LINKS
    It is possible to ignore links during validation. For this you should edit the
    metadata file that is created in the folder when the snapshot is taken. Adding
    a # at the start of a line ensures that this link is not validated. This can be
    helpfull for dynamic webpages.

NOTES
    Pages with dynamic content do always have differences. You may still want to
    check these yourself.

%s, %s
''' % (__version__, __author__, __date__))


def main(argv=None):
    '''Command line entry point.

    argv is the list of command line arguments without the program name;
    sys.argv[1:] is used when it is None. Exits with status 2 on an unknown
    option and with status 1 on a wrong number of arguments or on an error.
    '''
    if argv is None:
        argv = sys.argv[1:]

    try:
        opts, args = getopt.getopt(argv, 'shiv', ['help'])
    except getopt.GetoptError:
        # print help information and exit:
        _usage()
        sys.exit(2)

    take_snapshot = False
    inspect = False
    verbose = False

    for o, a in opts:
        if o in ('-h', '--help'):
            _usage()
            sys.exit()
        elif o == '-s':
            take_snapshot = True
        elif o == '-i':
            inspect = True
        elif o == '-v':
            verbose = True

    if len(args) != 2:
        _usage()
        sys.exit(1)

    snapshotdir, url = args

    try:
        if take_snapshot:
            snapshot(url, snapshotdir)
        else:
            answer = check(url, snapshotdir, inspect=inspect, verbose=verbose)
            print('Has the contents of any of the link changed? %s' % answer)
    except Exception as e:
        # Print the exception itself: e.message is empty for e.g. IOError
        # and does not exist at all on Python 3.
        sys.stderr.write('ERROR: %s\n' % e)
        sys.exit(1)


if __name__ == '__main__':
    main()