#!/usr/bin/env python
## Mat Kovach (matkovach@gmail.com)
## GPLv2
## 20080601 -- MEK
## First version. Just like me, it is ugly, lacks refinement,
## and is full of bugs.
## This may or may not be part of the RetroSQL (retrosql@yahoogroups.com)
##
## BEGIN CODE HERE
import errno
import getopt
import os
import socket
import sys
import time
import urllib2
import zipfile
from urlparse import urljoin
def geturl(url):
    """Open *url* and return the response object, or False on failure.

    Failures are reported on stdout instead of being raised, so the
    caller can skip one download and carry on with the next.

    Returns the file-like object produced by ``urllib2.urlopen`` on
    success, and ``False`` when the request fails with a URL error or a
    socket error.
    """
    try:
        return urllib2.urlopen(url)
    except urllib2.URLError as msg:
        print("Error, %s" % (msg))
    except socket.error as err:
        # Not every socket error carries an (errno, strerror) pair --
        # socket.timeout, for instance, has a single message argument --
        # so pick the fields apart defensively instead of unpacking.
        if len(err.args) == 2:
            code, text = err.args
        else:
            code, text = None, str(err)
        print("Socket error (%s) for url %s (%s)" % (code, url, text))
    return False
def unzipfile(file):
    """Extract every member of the zip archive *file* next to the archive.

    Members are written below the directory that contains *file*;
    sub-directories named inside the archive are created as needed.
    Directory entries (names ending in '/') are skipped.

    Returns True when every member was extracted. Returns False, after
    printing the reason, when the archive cannot be read, when a member
    name would resolve outside the target directory, or when a file
    cannot be written.
    """
    dest_dir = os.path.abspath(os.path.dirname(file))
    # join(..., '') appends exactly one trailing separator, even for '/'.
    dest_prefix = os.path.join(dest_dir, '')

    try:
        zf = zipfile.ZipFile(file)
    except (IOError, OSError, zipfile.BadZipfile) as msg:
        print("Error: %s" % (msg))
        return False

    try:
        for name in zf.namelist():
            if name.endswith('/'):
                continue
            target = os.path.abspath(os.path.join(dest_dir, name))
            # The archive comes from the network: refuse names such as
            # '../x' or absolute paths that would escape dest_dir.
            if not target.startswith(dest_prefix):
                print("Error: refusing to extract %s outside %s"
                      % (name, dest_dir))
                return False
            print("Extracting %s" % name)
            try:
                parent = os.path.dirname(target)
                if not os.path.isdir(parent):
                    os.makedirs(parent)
                with open(target, 'wb') as outfile:
                    outfile.write(zf.read(name))
            except (IOError, OSError, zipfile.BadZipfile) as msg:
                print("Error: %s" % (msg))
                return False
    finally:
        zf.close()
    return True
def usage():
    """Print the command-line help text to stdout.

    The default output directory shown is computed the same way main()
    computes it: a ``retrosheet`` directory below the current working
    directory.
    """
    prog = os.path.basename(sys.argv[0])
    default_dir = os.path.join(os.getcwd(), "retrosheet")
    print("Usage: %s -h --help -d --debug -k --keep -o DIR --output=DIR"
          % (prog))
    print("\t-h, --help Print this message")
    print("\t-d, --debug Show debugging info")
    print("\t-o DIR, --output=DIR Download/unzip into directory DIR")
    print("\tDefaults to: %s" % (default_dir))
    print("\t-k, --keep Keep zipfiles, do not delete.")
def _save_download(in_file, path):
    """Write the whole response *in_file* to *path*; always close *in_file*."""
    try:
        with open(path, "wb") as out_file:
            out_file.write(in_file.read())
    finally:
        in_file.close()


def main():
    """Download and unpack the Retrosheet archives, one per season/league.

    Parses the command line (see usage()), makes sure the output
    directory exists, then walks every season from 1950 up to, but not
    including, the current year. Seasons before 1997 are fetched as
    separate 'al' and 'nl' archives, later seasons as a single 'ml'
    archive. Each archive is saved into the output directory, unpacked
    there and, unless --keep was given, deleted afterwards. A failure on
    one archive is reported and the loop moves on to the next.

    Returns 0 when the loop completes. Exits with status 2 on invalid
    options and with status 1 when the output path is a regular file or
    a download target is a directory.
    """
    try:
        opts, _ = getopt.getopt(
            sys.argv[1:], "ho:dk", ["help", "debug", "output=", "keep"])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(2)

    output = None
    debug = False
    keep = False
    for option, arg in opts:
        if option in ("-d", "--debug"):
            debug = True
        elif option in ("-h", "--help"):
            usage()
            sys.exit()
        elif option in ("-o", "--output"):
            output = arg
        elif option in ("-k", "--keep"):
            keep = True
        else:
            assert False, "unhandled option"

    if output is None:
        output = os.path.join(os.getcwd(), "retrosheet")
    output = os.path.abspath(output)
    if debug:
        print("DEBUG: %s, %s, %s" % (keep, debug, output))

    if os.path.isfile(output):
        print("%s is a file, exiting ... " % (output))
        sys.exit(1)
    if not os.path.exists(output):
        print("%s does not exists, creating output directory" % (output))
        try:
            # makedirs so that a nested --output path works as well.
            os.makedirs(output)
        except OSError as e:
            # Ignore directory exists error
            if e.errno != errno.EEXIST:
                raise

    base_url = "http://www.retrosheet.org"
    this_year = int(time.strftime('%Y'))
    for year in range(1950, this_year):
        if year < 1997:
            leagues = ['al', 'nl']
        else:
            leagues = ['ml']
        for league in leagues:
            filename = "%d%s.zip" % (year, league)
            url = urljoin(base_url, "%d/%s" % (year, filename))
            print("Working on %s, %s" % (year, league))

            in_file = geturl(url)
            if in_file is False:
                print("Error, skipping %s" % (filename))
                continue

            target = os.path.join(output, filename)
            if os.path.isdir(target):
                in_file.close()
                print("%s exists, but is a directory!" % (filename))
                sys.exit(1)
            if os.path.exists(target):
                # os.remove reports failures as OSError, not IOError.
                try:
                    os.remove(target)
                except OSError as msg:
                    print("Error, %s" % (msg))
                else:
                    print("Removed previous %s" % (filename))

            try:
                _save_download(in_file, target)
            except (IOError, OSError) as msg:
                print("Error writing %s: %s" % (filename, msg))
                continue

            # "is False" also works with an unzipfile() that returns None
            # on success. A failed extraction leaves the archive in place.
            if unzipfile(target) is False:
                print("Error, skipping %s" % (filename))
                continue

            if not keep:
                try:
                    os.remove(target)
                except OSError as msg:
                    print("Error, %s" % (msg))
                else:
                    print("Removed %s" % (filename))
    return 0
# Script entry point: run the downloader and hand main()'s return value
# to the interpreter as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())