#!/usr/bin/env python # -*- mode: python-mode -*- usage = r''' Direction for use: Change to your personal cs141 work directory, then run update_repository. This will create (or update) the sub-directory "repository" and its contents from the web site. $ cd ~/cs141 $ update_repository $ ls repository ''' ######################################################################## ######################################################################## import os, md5, urllib2, rfc822, StringIO, urlparse import sys, urllib2 from HTMLParser import HTMLParser repository_url = 'http://www.cs.ucr.edu/~neal/2005/cs141/repository/' repository_here = 'repository' if not os.path.isdir(repository_here): os.makedirs(repository_here) ######################################################################## ## ## HTTPCache ## cacheSubDir__ = '.httpcache_py' if not os.path.isdir(cacheSubDir__): os.makedirs(cacheSubDir__) class HTTPCache: ''' Represents a single cached URL Requires Python 2.2 or later. Based on httpcache.py from http://bitworking.org/projects/httpcache/httpcache.py.txt: __author__ = "Joe Gregorio (joe@bitworking.org)" __copyright__ = "Copyright 2004, Joe Gregorio" __contributors__ = ["Kendall Clark", "Beat Bolli"] __version__ = "1.0.2 $Rev: 33 $" __license__ = "MIT" ''' def __init__(self, url, headers={}): self.info_ = None self.content_ = None self.fresh_ = False self.url_ = url #Create a non-clashing name for each url in the cache. digest = md5.new(url).digest() cacheFileName = "".join(["%02x" % (ord(c),) for c in digest]) self.cacheFullPath_ = os.path.join(cacheSubDir__, cacheFileName) self.headers_ = {} self.headers_.update(headers) if (os.path.exists(self.cacheFullPath_)): # Load up the cached version and use it's 'ETag' header value, if it exists. f = file(self.cacheFullPath_, "r") self.info_ = rfc822.Message(f) f.seek(0) self.content_ = f.read().split('\n\n', 1)[1] f.close() request = urllib2.Request(url, None, self.headers_) if self.info_.has_key('ETag'): request.add_header("If-None-Match", self.info_['ETag']) try: response = urllib2.urlopen(request) except urllib2.HTTPError, e: if (304 == e.code): self.fresh_ = True else: raise urllib2.HTTPError, e else: info = response.info() for key in info.keys(): self.info_[key] = info[key] self.content_ = self._writeContent(self.info_, response) else: # There isn't a cached version of this URL yet. request = urllib2.Request(url, None, self.headers_) response = urllib2.urlopen(request) response.info()['Url'] = url self.content_ = self._writeContent(response.info(), response) self.info_ = response.info() def content(self): """Get the content as a single string.""" return self.content_ def filename(self): """Get the full path file name of the cached file.""" return self.cacheFullPath_ def fresh(self): """Get the state of the cache; if true, the cached copy is fresh; if false, it's stale.""" return self.fresh_ def info(self): """Returns and rfc822.Message for manipulating headers. Note that you can use this to read headers but not to add or change headers. Use the 'add_headers()' for adding/changing header values permanently in the cache.""" return self.info_ def add_headers(self, headers): """Add/change header values in the cache. Note that if the key/value pair you change is used by HTTP then you risk the possibility that the value will be over-written the next time content is retrieved from that URL. """ for key in headers.keys(): self.info_[key] = headers[key] f = file(self.cacheFullPath_, "w") f.write(str(self.info_)) f.write("\n") f.write(self.content_) f.close() def _writeContent(self, info, response): f = file(self.cacheFullPath_, "w") f.write(str(info)) f.write("\n") content = "" content = response.read() f.write(content) f.close() return content ############################################################ END HTTPCache def fetch_hrefs(url): '''Fetch the urls from within the document at given url.''' urls = [] class parser(HTMLParser): def handle_starttag(self, tag, attrs): if tag == "a": for (x,y) in attrs: if (x=='href'): urls.append(y) try: f = urllib2.urlopen(url) except: print "Warning, unable to open", url, "!" return p = parser() p.feed(f.read()) p.close() return urls seen = {} verbose = 0 if len(sys.argv) > 1 and sys.argv[1] in ['-v', '--verbose']: verbose = 1 update_repository_modified = 0 def fetch(url, basename): # return if cgi or seen before if url.find("/?") != -1 or url in seen: return seen[url] = 1 #print url, "-->", basename if (os.path.isfile(basename)): try: os.remove(basename) except: print "Error: unable to replace old copy of", basename, "." raise if url[-1] == "/": # directory, recurse if (not os.path.isdir(basename)): try: if (verbose): print "mkdir", basename os.makedirs(basename) except: print "Error: unable to create directory", basename, "." raise for u in fetch_hrefs(url): full = urlparse.urljoin(url, u) if u[-1] == "/": u = u[0:-1] if full.find(url) == 0: fetch(full, basename + "/" + u) else: # file, get it cache = HTTPCache(url) # "Error: unable to fetch", url, "." try: fd = open(basename, "w") except: print "Error: unable to write ", basename, "." raise if not cache.fresh(): if verbose: print basename, "new or changed." else: print basename if basename.find('/bin/update_repository'): update_repository_modified = 1 elif verbose: print basename, "refreshed from cache." fd.write(cache.content_) fd.close() if basename.find("/bin/"): os.chmod(basename, 0755) if verbose: pass #print "Updated", basename, "." ########################################################################### new_dir = repository_here + ".%d" % os.getpid() current_dir = repository_here old_dir = repository_here + ".backup" try: # fetch repository try: fetch(repository_url, new_dir) except: print "Problem fetching repository!" raise # remove previous backup try: os.system("/bin/rm -rf " + old_dir) except: print "Huh, couldn't remove", old_dir, "." raise if os.path.exists(old_dir): print "Huh, couldn't remove", old_dir, "." print "Check your file permissions, maybe?" raise # rename current to backup try: if verbose: print "mv", current_dir, old_dir os.rename(current_dir, old_dir) except: print "Huh, rename of", current_dir, "to", old_dir, "failed?" print "Check your file permissions, maybe?" raise # rename new to current try: if verbose: print "mv", new_dir, current_dir os.rename(new_dir, current_dir) except: print "Huh, rename of", new_dir, "to", current_dir, " failed?" print "Check your file permissions, maybe?" raise if update_repository_modified: print "There's a new update_repository command." print "Creating a link to it here..." try: if (os.path.isfile("update_repository.backup")): os.remove("update_repository.backup") os.rename("update_repository", "update_repository.backup") cmd = "/bin/ls -s " + current_dir + "/bin/update_repository ." print '$', cmd os.system(cmd) except: print "Failed to create the link?" except: print "Failed to update your repository!" print print "################################" raise print current_dir, "succesfully updated."