Jump to content

User:John Vandenberg/pagefilters.py

From Meta, a Wikimedia project coordination wiki
# -*- coding: utf-8  -*-
"""
This module offers a wide variety of page filters, usually used to
reduce a set of pages.

In general, there is no need to run this script directly. It can, however,
be run for testing purposes. When run as a standalone utility,
it requires that a page generator be specified in addition to a filter.
It will then print the page titles to standard output.

These parameters are supported to specify which page titles to print:

&params;
"""

# Version identifier (presumably a version-control keyword placeholder).
__version__ = '$Id: $'

# Help text describing the command line options that FilterFactory accepts.
parameterHelp = """\
-nobluelinks      Removes pages that exist.

-regex:pattern    Keeps only pages whose title (without namespace)
                  matches the regular expression, case-insensitively.
"""

# Maps a documentation placeholder to the text that stands in for it.
# NOTE(review): presumably consumed by the framework's help output -- confirm.
docuReplacements = {
    '&params;': parameterHelp
}

# Standard library imports
import re, codecs, sys
import threading, Queue
import urllib, urllib2, time

# Application specific imports
import wikipedia, date, catlib
import config

def BlueLinkFilter(generator):
    """
    Wraps around another generator. Removes pages that exist, i.e. yields
    only those pages for which page.exists() is false.

    For every page that is removed, a "skipping <title>" line is printed.
    """
    for page in generator:
        if not page.exists():
            yield page
        else:
            # Only existing pages are dropped, so only they are reported.
            # The parenthesized form prints the same under Python 2 and 3.
            print("skipping %s" % page.title())

def DuplicatePageFilter(generator):
    """
    Wraps around another generator. Yields all pages, but prevents
    duplicates: a page is yielded only the first time it is encountered.
    The order of first occurrences is preserved.
    """
    # A list (rather than a set) is used so that pages only have to
    # support equality comparison, not hashing.
    seenPages = []
    for page in generator:
        if page not in seenPages:
            # Remember the page so later occurrences are suppressed.
            seenPages.append(page)
            yield page

class RegexPageFilter:
    """
    Wraps around another generator. Yields only those pages whose titles
    (without namespace) are positively matched by the regex.

    The regex is compiled case-insensitively and applied with re.match,
    so it has to match at the beginning of the title.
    """
    def __init__(self, regex):
        """Compile the pattern string *regex*, ignoring case."""
        self.reg = re.compile(regex, re.I)

    def __call__(self, *args):
        """
        Filter the page generator passed as the first positional argument.
        Any further arguments are ignored.
        """
        for page in args[0]:
            if self.reg.match(page.titleWithoutNamespace()):
                yield page

def PageWithTalkPageFilter(generator):
    """
    Wraps around another generator. Yields the same pages, but for non-talk
    pages, it also includes associated talk pages.
    This generator does not check if the talk page in fact exists.
    """
    for page in generator:
        yield page
        if not page.isTalkPage():
            # The associated talk page follows directly after its page.
            yield page.toggleTalkPage()

class FilterFactory:
    """
    This factory is responsible for processing command line arguments
    that are used by many scripts and that determine which pages
    to work on.
    """
    def __init__(self):
        # The factory keeps no state of its own.
        pass

    def handleArg(self, arg):
        """
        Return the page filter selected by the command line argument *arg*,
        or None if *arg* is not a recognized filter option.

        Recognized options:
          -nobluelinks     returns the BlueLinkFilter function
          -regex:PATTERN   returns a RegexPageFilter built from PATTERN

        In both cases the result is a callable that takes a page generator.
        """
        # Named pageFilter so the builtin filter() is not shadowed.
        pageFilter = None
        if arg.startswith('-nobluelinks'):
            pageFilter = BlueLinkFilter
        elif arg.startswith('-regex:'):
            # Everything after the '-regex:' prefix is the pattern.
            pageFilter = RegexPageFilter(arg[len('-regex:'):])

        return pageFilter

# Standalone test driver: combine one page generator and one page filter
# given on the command line and print the titles of the remaining pages.
if __name__ == "__main__":
    # pagegenerators is only needed by this driver.
    import pagegenerators

    gen = None
    fil = None
    genFactory = pagegenerators.GeneratorFactory()
    filterFactory = FilterFactory()
    for arg in wikipedia.handleArgs():
        pageFilter = filterFactory.handleArg(arg)
        if pageFilter:
            fil = pageFilter
        else:
            # Not a filter option, so offer it to the generator factory.
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator

    # Both a filter and a generator are required; otherwise nothing is done.
    if fil and gen:
        for page in fil(gen):
            wikipedia.output(page.title(), toStdout = True)