Fundraising 2012/FundStatScraper.py
Jump to navigation
Jump to search
Python 2.7 script for extracting tabular data from the Fundraising Statistics page.
Note: the script may write rows to the CSV file out of order.
#!/usr/bin/python
"""Wikimedia Foundation Fundraiser Statistics Scraper
This script will load and scrape data from http://wikimediafoundation.org/wiki/Special:FundraiserStatistics
and place it into a CSV file. It takes one argument; the CSV file name.
"""
import sys
import urllib2
import xml.dom.minidom
# Require exactly one command-line argument: the path of the CSV file to write.
if len(sys.argv) != 2:
    # sys.exit() with a string prints it to stderr and exits with status 1,
    # so shells and calling scripts can detect the usage error. The bare
    # exit() helper comes from the site module and would report success.
    sys.exit("This script needs 1 argument: A path to a writable CSV file. If the file does not exist it will be created.")
def floatC(s):
    """Convert a comma-grouped number such as "1,234.56" to a float.

    The argument is passed through ``str()`` first, so non-string values
    (for example ints) are accepted as well. Every comma is removed before
    the text is handed to ``float()``.

    s -- the value to convert.

    Returns the parsed float. Raises ValueError if the text that remains
    after removing commas is not a valid float literal.
    """
    return float(str(s).replace(",", ""))
def _cellValue(row, offset):
    """Return the number stored *offset* siblings after the first child of *row*.

    The text is taken from the first child of the selected node and parsed
    with floatC, so comma-grouped figures are accepted.
    """
    cell = row.firstChild
    for _ in range(offset):
        cell = cell.nextSibling
    return floatC(cell.firstChild.nodeValue)


# Fetch the statistics page, sending an explicit User-Agent header.
print("Obtaining page.")
data = urllib2.urlopen(urllib2.Request(
    'http://wikimediafoundation.org/wiki/Special:FundraiserStatistics',
    headers={'User-Agent': 'FundStatScrapeBot'}
))

print("Parsing DOM.")
try:
    dom = xml.dom.minidom.parseString(data.read())
finally:
    # Release the HTTP response whether or not reading/parsing succeeded.
    data.close()

print("Opening output file.")
# The with-block closes (and thereby flushes) the file even when a row fails
# to parse, so rows already written are not left in an unclosed handle.
with open(sys.argv[1], 'w') as out:
    out.write("date, dayTotal, contributions, avg, max, cumTotal\n")

    print("Iterating DOM.")
    for div in dom.getElementsByTagName('div'):
        # Only divs with this exact class attribute carry a statistics row.
        if div.getAttribute('class') != 'fundraiserstats-view-box':
            continue

        # NOTE(review): the navigation below assumes a fixed nesting inside
        # each box (presumably a table with three rows) -- confirm against
        # the live page markup before relying on it.
        firstRow = div.firstChild.firstChild
        date = firstRow.firstChild.firstChild.firstChild.nodeValue
        r2 = firstRow.nextSibling
        r3 = r2.nextSibling

        # The values sit at odd sibling offsets; the nodes in between are
        # skipped (presumably label cells -- TODO confirm).
        dayTotal = _cellValue(r2, 1)
        contributions = _cellValue(r2, 3)
        avg = _cellValue(r2, 5)
        maxValue = _cellValue(r3, 1)  # not named "max": that would shadow the builtin
        cumTotal = _cellValue(r3, 3)

        out.write("%s, %0.2f, %d, %0.2f, %0.2f, %0.2f\n" % (date, dayTotal, contributions, avg, maxValue, cumTotal))

print("Done.")