User:Ericj/MedCab Bot

This is a hacked-together bot that depends on the pywikipediabot framework and updates a list from a particular category, replacing a placeholder region with the generated list.
It also reads the article, parses a template, and prints the parsed parameters on the list page.
This bot is used with only minor configuration changes (the target page) to handle regular maintenance work for Wikipedia's Mediation Cabal.
#!/usr/local/bin/python
# -*- coding: utf-8 -*-


"""
Wikipedia:Mediation Cabal - Mediation listing automation bot.

This bot will grab all the new cases from the specified
category and add them to the specified page replacing the
region specified automatically.

It will also make a list of the open cases in the same
manner.  Set variables accordingly.
"""

import re, sys, string
sys.path.append('/home/misza13/pywikipedia')
import wikipedia, catlib, config

#**************
#* Variables: *
#**************

# Description of 'terms' in the new and open dict()s:
# category - Category to draw list of new cases from.
# target - Target page to add list of new cases to.
# section -
#   Section of target page to replace, which is delimited
#   by <!-- BEGIN [section] --> <!-- END [section] -->
# titlemask -
#   The title mask removes some portion of the title from
#   the link list.
# exclude -
#   Page exclusion regex.  List the pages that you don't
#   want to have included in the output list.
#   Example:
#   Template\:Medcab2$|Wikipedia\:Mediation Cabal\/Complaints$
# action - Update action text.

# Settings for the list of new cases.
new = {
  'category': 'Wikipedia_Medcab_new_cases',
  'target': 'Wikipedia:Mediation Cabal/Cases',
  'section': 'NewCases',
  'titlemask': r"^Wikipedia\:Mediation Cabal\/Cases\/",
  'exclude': r"^Template\:.*|^User:.*",
  'action': "Updating new cases",
}

# Settings for the list of open cases.
# NOTE: this module-level name shadows the built-in open() inside this
# module; it is kept because the bot class looks the settings up by name.
open = {
  'category': 'Wikipedia_Medcab_open_cases',
  'target': 'Wikipedia:Mediation Cabal/Cases',
  'section': 'OpenCases',
  'titlemask': r"^Wikipedia\:Mediation Cabal\/Cases\/",
  'exclude': r"^Template\:.*|^User:.*",
  'action': "Updating open cases",
}

# Settings for the status template that is parsed on every case page:
# tmpl - Name of the status template.
status = {'tmpl': r'Medcabstatus'}

# The bot is only meant to be run occasionally, so a comparatively short
# delay of 5 seconds is used for both writing and reading pages.
wikipedia.put_throttle.setDelay(5) # 5 seconds
wikipedia.get_throttle.setDelay(5) # 5 seconds

# *******************
# * MedCabBot Class *
# *******************
class MedCabBot:
  """Regenerates the Mediation Cabal case lists on their target page(s).

  All settings are read from the module-level ``new``, ``open`` and
  ``status`` dicts; the instance itself holds no state.
  """

  def __init__(self):
    pass

  def run(self):
    """Rebuild the new-cases and open-cases lists and save any changes.

    When both lists share one target page, the page is fetched once, both
    sections are regenerated and at most one edit is saved; its action text
    names only the list(s) that actually changed.  Otherwise each target
    page is fetched, regenerated and saved independently.
    """
    if new['target'] != open['target']:
      self._update_single_list(new, u'New Cases List')
      self._update_single_list(open, u'Open Cases List')
      return

    wikipedia.output(u'Processing Cases Lists')
    page_target = wikipedia.Page(wikipedia.getSite(), new['target'])
    page_data = page_target.get()
    # The open-cases pass works on the output of the new-cases pass so that
    # both regenerated sections end up in the same text.
    new_page_data = self.process_category(new, page_data)
    open_page_data = self.process_category(open, new_page_data)

    new_changed = new_page_data != page_data
    open_changed = open_page_data != new_page_data
    if not (new_changed or open_changed):
      wikipedia.output(u'Cases Lists are already up-to-date')
      return

    if new_changed and open_changed:
      action = new['action'] + ' & ' + open['action']
    elif new_changed:
      action = new['action']
    else:
      action = open['action']
    wikipedia.output(u'Updating Cases Lists')
    wikipedia.setAction(action)
    page_target.put(open_page_data)

  def _update_single_list(self, pgt, label):
    """Regenerate one list on its own target page, saving only on change.

    pgt   - settings dict (``new`` or ``open``).
    label - human-readable list name used in the progress messages.
    """
    wikipedia.output(u'Processing ' + label)
    page_target = wikipedia.Page(wikipedia.getSite(), pgt['target'])
    page_data = page_target.get()
    updated = self.process_category(pgt, page_data)
    if updated != page_data:
      wikipedia.output(u'Updating ' + label)
      wikipedia.setAction(pgt['action'])
      page_target.put(updated)
    else:
      wikipedia.output(label + u' is already up-to-date')

  def process_category(self, pgt, page_data):
    """Return page_data with the section for pgt replaced by a fresh list.

    pgt       - settings dict; the keys 'category', 'section', 'titlemask'
                and 'exclude' are read.
    page_data - current text of the target page.

    Every region delimited by <!-- BEGIN section --> / <!-- END section -->
    is replaced by one bullet per category member whose title does not match
    the exclude regex.  An empty category yields an empty list.
    """
    category = pgt['category']
    section = pgt['section']
    exclude_regex = re.compile(pgt['exclude'])
    titlemask_regex = re.compile(pgt['titlemask'])

    cat = catlib.Category(wikipedia.getSite(), 'Category:' + category)
    # Materialise the members so they can be counted for the progress output.
    pages = list(cat.articles())
    total = len(pages)
    wikipedia.output(u'Now processing ' + str(total) + ' mediation request pages.')

    entries = [u"\n"]
    for index, page in enumerate(pages):
      title = page.title()
      progress = str(index + 1) + u' of ' + str(total) + ' ' + title
      if exclude_regex.match(title):
        wikipedia.output(progress + ' - Skipping')
        continue
      wikipedia.output(progress)
      params = self.get_tmpl_params(page, status['tmpl'])
      entries.append(self._format_case(title, titlemask_regex.sub('', title), params))

    replacement = (u'<!-- BEGIN ' + section + ' -->' + u''.join(entries)
                   + '<!-- END ' + section + ' -->')
    marker = re.escape(section)
    region = re.compile(r'<!-- BEGIN ' + marker + r' -->.*?<!-- END ' + marker + r' -->', re.S)
    # A function is used as the replacement so that backslashes in titles,
    # mediator names or comments are inserted literally instead of being
    # interpreted as escapes or group references by re.
    return region.sub(lambda match: replacement, page_data)

  def _format_case(self, title, label, params):
    """Build the wikitext list entry (with trailing newline) for one case.

    title  - full page title, used as the link target.
    label  - link text (the title with the title mask removed).
    params - dict of status template parameters, or None / empty.
    """
    entry = u'* [[' + title + '|' + label + ']]'
    if params and params.get('mediators'):
      entry = entry + u' — Mediator(s): ' + params['mediators']
    entry = entry + u"\n"
    if params and params.get('comment'):
      entry = entry + u'** Comment: ' + params['comment'] + u"\n"
    return entry

  def get_tmpl_params(self, page, tmpl_name):
    """Parse the parameters of the first use of a template on a page.

    page      - object whose get() returns the page text.
    tmpl_name - template name; it is inserted into a regular expression
                unescaped and matched case-insensitively.

    Returns a dict mapping stripped parameter names to stripped values, or
    None when the template is not found or is never closed.

    Parameters are split on newlines (plus any pipes that directly follow
    the newline), so each parameter must start on its own line; entries
    without an '=' are ignored.  Nested templates inside values are kept
    intact at any nesting depth.
    """
    page_data = page.get()
    # Strip comments; re.S lets a comment span several lines.
    page_data = re.compile(r'<!--.*?-->', re.S).sub('', page_data)

    head = re.compile(r'\{\{' + tmpl_name + r'\W*?\|', re.I | re.S).search(page_data)
    if not head:
      return None
    param_start = head.end()

    # Walk over the following {{ and }} delimiters, tracking nesting depth;
    # the first }} seen at depth zero closes the template itself.
    param_end = None
    depth = 0
    for token in re.compile(r'\{\{|\}\}').finditer(page_data, param_start):
      if token.group() == '{{':
        depth = depth + 1
      elif depth:
        depth = depth - 1
      else:
        param_end = token.start()
        break
    if param_end is None:
      # Unbalanced braces: treat the template as unparsable.
      return None

    params = dict()
    for param in re.split(r"\n[\|]*", page_data[param_start:param_end]):
      # Split only on the first equal sign; skip blanks and unnamed entries.
      pieces = param.split('=', 1)
      if len(pieces) > 1:
        params[pieces[0].strip()] = pieces[1].strip()

    # Debugging output.
    print(params)

    return params

if __name__ == "__main__":
  # Run the bot once; stopme() is called even if the run raises.
  try:
    MedCabBot().run()
  finally:
    wikipedia.stopme()