#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This module offers a wide variety of page filters, usually used to
reduce a set of pages.
In general, there is no need to run this script directly. It can, however,
be run for testing purposes. When run as a standalone utility
it requires that a page generator be specified in addition to a filter.
It will then print the page titles to standard output.
These parameters are supported to specify which page titles to print:
¶ms;
"""
__version__='$Id: $'

# Help text for the command line options recognised by
# FilterFactory.handleArg. It previously omitted the -regex: option.
parameterHelp = """\
-nobluelinks Removes pages that exist.

-regex:xxx   Keeps only pages whose title (without namespace) matches the
             regular expression xxx. Matching is case-insensitive and
             anchored at the start of the title.
"""

# Maps the placeholder that appears in the module docstring to the text that
# should replace it.
# NOTE(review): presumably consumed by wikipedia.showHelp() -- confirm.
docuReplacements = {
    '¶ms;': parameterHelp
}
# Standard library imports
import re, codecs, sys
import threading, Queue
import urllib, urllib2, time
# Application specific imports
import wikipedia, date, catlib
import config
def BlueLinkFilter(generator):
    """
    Wraps around another generator. Yields only those pages that do not
    exist, i.e. removes pages that exist.

    For every page that is dropped, a "skipping <title>" line is printed
    to standard output.
    """
    # Iterate over the argument. The loop used to read the name `gen`, which
    # is not defined in this function, so the filter raised a NameError
    # unless a global of that name happened to exist.
    for page in generator:
        if not page.exists():
            yield page
        else:
            # A single parenthesised argument prints the same text with the
            # Python 2 print statement and with the print() function.
            print("skipping %s" % page.title())
def DuplicatePageFilter(generator):
    """
    Wraps around another generator. Passes every page through the first
    time it is encountered and silently drops any later repetition.

    Pages are compared with the `in` operator against the list of pages
    yielded so far, so the order of first occurrences is preserved.
    """
    alreadyYielded = []
    for candidate in generator:
        if candidate in alreadyYielded:
            # Seen before: drop the repetition.
            continue
        alreadyYielded.append(candidate)
        yield candidate
class RegexPageFilter:
    """
    Callable filter that wraps around another generator. Yields only those
    pages whose title (without namespace) is matched by the regex.

    The regex is compiled case-insensitively and applied with match(), so
    it has to match from the beginning of the title.
    """
    def __init__(self, regex):
        # Compile once; the pattern is reused for every page.
        self.reg = re.compile(regex, re.I)

    def __call__(self, *args):
        # The page generator is expected as the first positional argument.
        pages = args[0]
        for candidate in pages:
            title = candidate.titleWithoutNamespace()
            if not self.reg.match(title):
                continue
            yield candidate
def PageWithTalkPageFilter(generator):
    """
    Wraps around another generator. Every page is passed through unchanged;
    each page that is not itself a talk page is immediately followed by
    the result of its toggleTalkPage() call.

    No check is made whether that talk page actually exists.
    """
    for current in generator:
        yield current
        if current.isTalkPage():
            # Talk pages are not paired with anything.
            continue
        yield current.toggleTalkPage()
class FilterFactory:
    """
    Factory that turns command line arguments into page filters, so that
    scripts can share the options which decide which pages to work on.
    """
    def __init__(self):
        pass

    def handleArg(self, arg):
        """
        Returns the filter selected by arg, or None if arg is not a
        filter option.

        '-nobluelinks' selects BlueLinkFilter; '-regex:<pattern>' builds a
        RegexPageFilter from the text that follows the colon.
        """
        if arg.startswith('-nobluelinks'):
            return BlueLinkFilter
        if arg.startswith('-regex:'):
            # Everything after the 7-character '-regex:' prefix is the pattern.
            pattern = arg[7:]
            return RegexPageFilter(pattern)
        return None
if __name__ == "__main__":
    # Standalone test mode: combine one filter option with one page
    # generator option and print the titles of the pages that remain.
    import pagegenerators
    try:
        gen = None
        fil = None
        genFactory = pagegenerators.GeneratorFactory()
        filterFactory = FilterFactory()
        for arg in wikipedia.handleArgs():
            # Try the argument as a filter option first; only if it is not
            # one, offer it to the generator factory.
            parsedFilter = filterFactory.handleArg(arg)
            if parsedFilter:
                fil = parsedFilter
                continue
            parsedGenerator = genFactory.handleArg(arg)
            if parsedGenerator:
                gen = parsedGenerator
        if not (fil and gen):
            # Either the filter or the generator is missing: show usage.
            wikipedia.showHelp('pagefilters')
        else:
            for page in fil(gen):
                wikipedia.output(page.title(), toStdout = True)
    finally:
        wikipedia.stopme()