Fundraising 2012/

From Meta, a Wikimedia project coordination wiki
Jump to navigation Jump to search

Python 2.7 script for extracting tabular data from the Fundraising Statistics page.

Note: rows are written in the order they are encountered on the page, so the saved data may be out of chronological order.

"""Wikimedia Foundation Fundraiser Statistics Scraper

This script will load and scrape data from the Fundraising Statistics page
and place it into a CSV file. It takes one argument: the CSV file name.
"""

import sys
import urllib2
import xml.dom.minidom

# Abort early when the output path is missing or extra arguments are given:
# the rest of the script relies on sys.argv[1], so carrying on would only
# fail later with an IndexError.
if len(sys.argv) != 2:
    print("This script needs 1 argument: A path to a writable CSV file. If the file does not exist it will be created.")
    sys.exit(1)

def floatC(s):
    """Convert a number written with comma separators (e.g. "1,234.50") to a float.

    The argument is passed through str() first, so non-string values are
    accepted as well. Raises ValueError if the text left after removing the
    commas is not a valid float literal.
    """
    withoutCommas = "".join(str(s).split(","))
    return float(withoutCommas)

# Step 1: download the statistics page.
# NOTE(review): the Request(...) call below is cut off in this copy of the
# script -- its argument(s) and the closing parentheses are missing and must
# be restored before the script can run.
print "Obtaining page."
data = urllib2.urlopen(urllib2.Request(

# Step 2: parse the downloaded markup into a DOM tree.
# NOTE(review): this call is truncated as well; presumably it parses the
# content read from `data` -- confirm against the original script.
print "Parsing DOM."
dom = xml.dom.minidom.parseString(

def _cellNumber(row, index):
    """Return the numeric content of the child of `row` at position `index`."""
    return floatC(row.childNodes[index].firstChild.nodeValue)


print("Opening output file.")
# open() inside a with-block guarantees the CSV is flushed and closed, even
# when a statistics box has an unexpected layout and the extraction raises.
with open(sys.argv[1], 'w') as out:
    out.write("date, dayTotal, contributions, avg, max, cumTotal\n")

    print("Iterating DOM.")
    for div in dom.getElementsByTagName('div'):
        # Only the per-day statistic boxes are exported; skip every other div.
        if div.getAttribute('class') != 'fundraiserstats-view-box':
            continue

        # The first child of the box holds three rows: the date row followed
        # by two rows of figures.
        rows = div.firstChild.childNodes
        date = rows[0].firstChild.firstChild.firstChild.nodeValue
        r2 = rows[1]
        r3 = rows[2]

        # Figures are read from child positions 1, 3 and 5; the children in
        # between are not read (presumably label cells -- confirm against the
        # page markup).
        dayTotal = _cellNumber(r2, 1)
        contributions = _cellNumber(r2, 3)
        avg = _cellNumber(r2, 5)

        # Named dayMax rather than max so the builtin is not shadowed; the
        # CSV column is still called "max".
        dayMax = _cellNumber(r3, 1)
        cumTotal = _cellNumber(r3, 3)

        out.write("%s, %0.2f, %d, %0.2f, %0.2f, %0.2f\n" % (date, dayTotal, contributions, avg, dayMax, cumTotal))


print("Done.")