User:John Vandenberg/pywikipedia-unusedfiles.diff

From Meta, a Wikimedia project coordination wiki
Index: wikipedia.py
===================================================================
--- wikipedia.py        (revision 4195)
+++ wikipedia.py        (working copy)
@@ -3732,21 +3732,43 @@
             if not repeat:
                 break
 
-    def unusedfiles(self, number = 10, repeat = False):
+    def unusedfiles(self, number = 10, repeat = False, extension = None):
+        """Yield an ImagePage for each title found in the unused-files list.
+
+        number    - passed as n to unusedfiles_address() to build the URL
+                    that is fetched.
+        repeat    - if true, fetch the list again after each pass instead
+                    of stopping; titles already yielded are skipped.
+        extension - if given, only yield titles whose text after the last
+                    dot equals this string (e.g. 'djvu'); the comparison
+                    is case-sensitive.  None or '' disables the filter.
+
+        Titles for which the page 'Page:' + title[6:] exists are skipped
+        as well; title[6:] drops the leading 'Image:' matched by entryR.
+        """
         throttle = True
         seen = set()
         while True:
             path = self.unusedfiles_address(n=number)
             get_throttle()
             html = self.getUrl(path)
-            entryR = re.compile('<li>\(<a href=".+?" title="(?P<title>.+?)">.+?</a>\) ')
+            entryR = re.compile('<a href=".+?" title="(?P<title>Image:.+?)">.+?</a>')
             for m in entryR.finditer(html):
                 title = m.group('title')
+                # Compare the text after the last dot rather than a fixed
+                # three-character tail, so 'djvu' or 'jpeg' can match too.
+                if extension and title[title.rfind('.') + 1:] != extension:
+                    continue
 
-                if title not in seen:
-                    seen.add(title)
-                    page = ImagePage(self, title)
-                    yield page
+                if title not in seen:
+                    # Check whether the media is used in a Proofread page
+                    basename = title[6:]
+                    page = Page(self, 'Page:' + basename)
+
+                    if not page.exists():
+                        seen.add(title)
+                        image = ImagePage(self, title)
+                        yield image
             if not repeat:
                 break
 
Index: pagegenerators.py
===================================================================
--- pagegenerators.py   (revision 4195)
+++ pagegenerators.py   (working copy)
@@ -118,10 +118,10 @@
     for page in pageWithImages.imagelinks(followRedirects = False, loose = True):
         yield page
 
-def UnusedFilesGenerator(number = 100, repeat = False, site = None):
+def UnusedFilesGenerator(number = 100, repeat = False, site = None, extension = None):
     if site is None:
         site = wikipedia.getSite()
-    for page in site.unusedfiles(number=number, repeat=repeat):
+    for page in site.unusedfiles(number=number, repeat=repeat, extension=extension):
         yield wikipedia.ImagePage(page.site(), page.title())
 
 def WithoutInterwikiPageGenerator(number = 100, repeat = False, site = None):