User:John Vandenberg/pagefilters.py
Appearance
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This module offers a wide variety of page filters, usually used to reduce a
set of pages.

In general, there is no need to run this script directly. It can, however,
be run for testing purposes.

When run as a standalone utility it requires that a page generator is
specified in addition to a filter. It will then print the page titles to
standard output.

These parameters are supported to specify which page titles to print:

&params;
"""
__version__='$Id: $'

# Help text for the command line options understood by FilterFactory.
parameterHelp = """\
-nobluelinks      Removes pages that exist.

-regex:xxx        Keeps only pages whose title (without namespace) matches
                  the regular expression xxx at its start, ignoring case.
"""

# Maps the placeholder used in the module docstring to its replacement text.
# NOTE(review): presumably consumed by wikipedia.showHelp(), as in the other
# pywikipedia scripts -- confirm. The key must be the literal '&params;'
# (it had been corrupted to a pilcrow sign by HTML entity decoding).
docuReplacements = {
    '&params;': parameterHelp
}

# Standard library imports
import re, codecs, sys
import threading, Queue
import urllib, urllib2, time

# Application specific imports
import wikipedia, date, catlib
import config


def BlueLinkFilter(generator):
    """
    Wraps around another generator. Yields only the pages that do not exist,
    i.e. removes pages that exist.

    For every page that is dropped, a "skipping <title>" line is printed to
    standard output.
    """
    # Iterate over the parameter itself; the previous code referenced an
    # undefined name and only worked when a global of that name existed.
    for page in generator:
        if not page.exists():
            yield page
        else:
            print("skipping %s" % page.title())


def DuplicatePageFilter(generator):
    """
    Wraps around another generator. Yields all pages, but prevents
    duplicates.

    Pages are compared with the 'in' operator against the list of pages
    already yielded, so the first occurrence of each page is kept.
    """
    # A list (not a set) is used on purpose: only equality comparison of
    # page objects is relied upon here, not hashability.
    seenPages = []
    for page in generator:
        if page not in seenPages:
            seenPages.append(page)
            yield page


class RegexPageFilter:
    """
    Wraps around another generator. Yields only those pages whose titles
    (without namespace) are positively matched by regex.

    The pattern is compiled case-insensitively and applied with re.match,
    so it has to match at the beginning of the title.
    """
    def __init__(self, regex):
        """Compile regex (case-insensitive) for later use by __call__."""
        self.reg = re.compile(regex, re.I)

    def __call__(self, *args):
        """
        Filter the page generator given as first positional argument.

        Any further positional arguments are ignored.
        """
        for page in args[0]:
            if self.reg.match(page.titleWithoutNamespace()):
                yield page


def PageWithTalkPageFilter(generator):
    """
    Wraps around another generator. Yields the same pages, but for non-talk
    pages, it also includes associated talk pages.
    This generator does not check if the talk page in fact exists.
    """
    for page in generator:
        yield page
        if not page.isTalkPage():
            yield page.toggleTalkPage()


class FilterFactory:
    """
    This factory is responsible for processing command line arguments
    that are used by many scripts and that determine which pages to work on.
    """
    def __init__(self):
        pass

    def handleArg(self, arg):
        """
        Return the page filter selected by the command line argument arg.

        The result is a callable that takes a page generator and returns a
        filtered generator, or None if arg is not a filter option.
        """
        # Named pageFilter rather than 'filter' to avoid shadowing the
        # builtin of that name.
        pageFilter = None
        if arg.startswith('-nobluelinks'):
            pageFilter = BlueLinkFilter
        elif arg.startswith('-regex:'):
            # Everything after the '-regex:' prefix is the pattern.
            pageFilter = RegexPageFilter(arg[7:])
        return pageFilter


if __name__ == "__main__":
    import pagegenerators
    try:
        gen = None
        fil = None
        genFactory = pagegenerators.GeneratorFactory()
        filterFactory = FilterFactory()
        for arg in wikipedia.handleArgs():
            # Filter options take precedence; everything else is offered to
            # the generator factory. The last filter/generator given wins.
            pageFilter = filterFactory.handleArg(arg)
            if pageFilter:
                fil = pageFilter
            else:
                generator = genFactory.handleArg(arg)
                if generator:
                    gen = generator
        if fil and gen:
            for page in fil(gen):
                wikipedia.output(page.title(), toStdout = True)
        else:
            # Both a filter and a generator are required.
            wikipedia.showHelp('pagefilters')
    finally:
        wikipedia.stopme()