List of Wikipedias by sample of articles/Source code

From Meta, a Wikimedia project coordination wiki
Jump to: navigation, search

Here is Smeira's original source code, but it is out of date. The most recent updates have used the source code below. There are three files. The first is the actual program. The second is a utility to generate the list of articles. The third is a utility to generate the list of previous scores used to calculate growth.

MakeScoreTable.py[edit]

# -*- coding: utf_8 -*-
import sys
sys.path.append('./pywikipedia')
 
import wikipedia
import pywikibot
import pagegenerators
import catlib
import traceback
import os
import re
from time import strftime
 
#language information
lang_info ={'en': {'name':u'English',     'localname':u'English',  'weight': 1.0},
            'de': {'name':u'German',      'localname':u'Deutsch',  'weight':1.0},
            'fr': {'name':u'French',      'localname':u'Français', 'weight':1.0},
            'pl': {'name':u'Polish',      'localname':u'Polski',   'weight':1.1},
            'ja': {'name':u'Japanese',    'localname':u'日本語',    'weight':1.9},
            'it': {'name':u'Italian',     'localname':u'Italiano', 'weight':1.1},
            'nl': {'name':u'Dutch',       'localname':u'Nederlands', 'weight':0.9},
            'pt': {'name':u'Portuguese',  'localname':u'Português', 'weight':1.1},
            'es': {'name':u'Spanish',     'localname':u'Español',  'weight':1.1},
            'sv': {'name':u'Swedish',     'localname':u'Svenska',  'weight':1.1},
            'ru': {'name':u'Russian',     'localname':u'Русский',  'weight':1.4},
            'zh': {'name':u'Chinese',     'localname':u'中文',      'weight':3.7},
            'no': {'name':u'Norwegian (Bokmål)','localname':u'Norsk (Bokmål)', 'weight':1.2},
            'fi': {'name':u'Finnish',     'localname':u'Suomi', 'weight':1.1},
            'vo': {'name':u'Volapük',     'localname':u'Volapük'},
            'ca': {'name':u'Catalan',     'localname':u'Català', 'weight':1.1},
            'ro': {'name':u'Romanian',    'localname':u'Română', 'weight':1.1},
            'tr': {'name':u'Turkish',     'localname':u'Türkçe', 'weight':1.3},
            'uk': {'name':u'Ukrainian',   'localname':u'Українська', 'weight':1.3},
            'eo': {'name':u'Esperanto',   'localname':u'Esperanto', 'weight':1.1},
            'cs': {'name':u'Czech',       'localname':u'Čeština', 'weight':1.3},
            'hu': {'name':u'Hungarian',   'localname':u'Magyar', 'weight':1.1},
            'sk': {'name':u'Slovak',      'localname':u'Slovenčina', 'weight':1.3},
            'da': {'name':u'Danish',      'localname':u'Dansk', 'weight':1.2},
            'id': {'name':u'Indonesian',  'localname':u'Bahasa Indonesia', 'weight':0.9},
            'he': {'name':u'Hebrew',      'localname':u'עברית', 'weight':1.2},
            'lt': {'name':u'Lithuanian',  'localname':u'Lietuvių', 'weight':1.2},
            'sr': {'name':u'Serbian',     'localname':u'Српски / Srpski', 'weight':1.4},
            'sl': {'name':u'Slovenian',   'localname':u'Slovenščina', 'weight':1.2},
            'ko': {'name':u'Korean',      'localname':u'한국어', 'weight':2.5},
            'ar': {'name':u'Arabic',      'localname':u'العربية', 'weight':1.0},
            'bg': {'name':u'Bulgarian',   'localname':u'Български', 'weight':1.1},
            'et': {'name':u'Estonian',    'localname':u'Eesti', 'weight':1.2},
            'hr': {'name':u'Croatian',    'localname':u'Hrvatski', 'weight':1.3},
            'new':{'name':u'Newar / Nepal Bhasa','localname':u'नेपाल भाषा'},
            'te': {'name':u'Telugu',      'localname':u'తెలుగు'},
            'vi': {'name':u'Vietnamese',  'localname':u'Tiếng Việt', 'weight':1.1},
            'th': {'name':u'Thai',        'localname':u'ไทย', 'weight':1.0},
            'gl': {'name':u'Galician',    'localname':u'Galego', 'weight':1.2},
            'fa': {'name':u'Persian',     'localname':u'فارسی', 'weight':1.2},
            'nn': {'name':u'Norwegian (Nynorsk)','localname':u'Nynorsk', 'similar_lang':'no'},
            'ceb':{'name':u'Cebuano',     'localname':u'Sinugboanong Binisaya', 'weight':0.8},
            'el': {'name':u'Greek',       'localname':u'Ελληνικά', 'weight':1.1},
            'ms': {'name':u'Malay',       'localname':u'Bahasa Melayu', 'weight':1.0},
            'simple':{'name':u'Simple English','localname':u'Simple English'},
            'eu': {'name':u'Basque',      'localname':u'Euskara', 'weight':1.1},
            'bpy':{'name':u'Bishnupriya Manipuri','localname':u'ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী'},
            'bs': {'name':u'Bosnian',     'localname':u'Bosanski', 'similar_lang':'hr'},
            'lb': {'name':u'Luxembourgish','localname':u'Lëtzebuergesch'},
            'is': {'name':u'Icelandic',   'localname':u'Íslenska', 'weight':1.1},
            'ka': {'name':u'Georgian',    'localname':u'ქართული'},
            'sq': {'name':u'Albanian',    'localname':u'Shqip'},
            'la': {'name':u'Latin',       'localname':u'Latina', 'weight':1.1},
            'br': {'name':u'Breton',      'localname':u'Brezhoneg'},
            'az': {'name':u'Azeri',       'localname':u'Azərbaycan', 'weight':1.2},
            'hi': {'name':u'Hindi',       'localname':u'हिन्दी', 'weight':1.0},
            'bn': {'name':u'Bengali',     'localname':u'বাংলা'},
            'ht': {'name':u'Haitian',     'localname':u'Krèyol ayisyen'},
            'mk': {'name':u'Macedonian',  'localname':u'Македонски', 'weight':1.3},
            'mr': {'name':u'Marathi',     'localname':u'मराठी'},
            'sh': {'name':u'Serbo-Croatian','localname':u'Srpskohrvatski / Српскохрватски', 'similar_lang':'hr'},
            'tl': {'name':u'Tagalog',     'localname':u'Tagalog'},
            'io': {'name':u'Ido',         'localname':u'Ido'},
            'cy': {'name':u'Welsh',       'localname':u'Cymraeg', 'weight':1.2},
            'pms':{'name':u'Piedmontese', 'localname':u'Piemontèis'},
            'lv': {'name':u'Latvian',     'localname':u'Latviešu', 'weight':1.1},
            'su': {'name':u'Sundanese',   'localname':u'Basa Sunda'},
            'ta': {'name':u'Tamil',       'localname':u'தமிழ்', 'weight':0.9},
            'jv': {'name':u'Javanese',    'localname':u'Basa Jawa'},
            'nap':{'name':u'Neapolitan',  'localname':u'Nnapulitano'},
            'oc': {'name':u'Occitan',     'localname':u'Occitan'},
            'nds':{'name':u'Low Saxon',   'localname':u'Plattdüütsch'},
            'scn':{'name':u'Sicilian',    'localname':u'Sicilianu'},
            'ast':{'name':u'Asturian',    'localname':u'Asturianu'},
            'ku': {'name':u'Kurdish',     'localname':u'Kurdî / كوردی'},
            'be': {'name':u'Belarusian',  'localname':u'Беларуская', 'similar_lang':'be-x-old'},
            'wa': {'name':u'Walloon',     'localname':u'Walon'},
            'af': {'name':u'Afrikaans',   'localname':u'Afrikaans', 'weight':1.0},
            'be-x-old':{'name':u'Belarusian (Taraškievica)','localname':u'Беларуская (тарашкевіца)', 'weight':1.4},
            'tg': {'name':u'Tajik',       'localname':u'Тоҷикӣ'},
            'an': {'name':u'Aragonese',   'localname':u'Aragonés', 'weight':1.1},
            'fy': {'name':u'West Frisian','localname':u'Frysk'},
            'vec':{'name':u'Venetian',    'localname':u'Vèneto'},
            'roa-tara':{'name':u'Tarantino',   'localname':u'Tarandíne'},
            'cv': {'name':u'Chuvash',     'localname':u'Чăваш'},
            'zh-yue':{'name':u'Cantonese',   'localname':u'粵語', 'similar_lang':'zh'},
            'ur': {'name':u'Urdu',        'localname':u'اردو'},
            'ksh':{'name':u'Ripuarian',   'localname':u'Ripoarisch'},
            'sw': {'name':u'Swahili',     'localname':u'Kiswahili'},
            'qu': {'name':u'Quechua',     'localname':u'Runa Simi'},
            'uz': {'name':u'Uzbek',       'localname':u'O‘zbek'},
            'mi': {'name':u'Maori',       'localname':u'Māori'},
            'ga': {'name':u'Irish',       'localname':u'Gaeilge'},
            'bat-smg':{'name':u'Samogitian',  'localname':u'Žemaitėška'},
            'ml': {'name':u'Malayalam',   'localname':u'മലയാളം', 'weight':1.1},
            'gd': {'name':u'Scottish Gaelic','localname':u'Gàidhlig'},
            'yo': {'name':u'Yoruba',      'localname':u'Yorùbá'},
            'co': {'name':u'Corsican',    'localname':u'Corsu'},
            'kn': {'name':u'Kannada',     'localname':u'ಕನ್ನಡ', 'weight':0.9},
            'pam':{'name':u'Kapampangan', 'localname':u'Kapampangan'},
            'yi': {'name':u'Yiddish',     'localname':u'ייִדיש'},
            'hsb':{'name':u'Upper Sorbian','localname':u'Hornjoserbsce'},
            'nah':{'name':u'Nahuatl',     'localname':u'Nāhuatl'},
            'ia': {'name':u'Interlingua', 'localname':u'Interlingua', 'weight':1.0},
            'li': {'name':u'Limburgian',  'localname':u'Limburgs'},
            'sa': {'name':u'Sanskrit',    'localname':u'संस्कृतम्'},
            'hy': {'name':u'Armenian',    'localname':u'Հայերեն', 'weight':1.2},
            'tt': {'name':u'Tatar',       'localname':u'Tatarça / Татарча'},
            'als':{'name':u'Alemannic',   'localname':u'Alemannisch', 'weight':1.1},
            'roa-rup':{'name':u'Aromanian',   'localname':u'Armãneashce'},
            'lmo':{'name':u'Lombard',     'localname':u'Lumbaart'},
            'map-bms':{'name':u'Banyumasan',  'localname':u'Basa Banyumasan'},
            'am': {'name':u'Amharic',     'localname':u'አማርኛ'},
            'nrm':{'name':u'Norman',      'localname':u'Nouormand/Normaund'},
            'zh-min-nan':{'name':u'Min Nan',     'localname':u'Bân-lâm-gú', 'weight':1.2},
            'pag':{'name':u'Pangasinan',  'localname':u'Pangasinan'},
            'wuu':{'name':u'Wu',          'localname':u'吴语', 'similar_lang':'zh'},
            'fo': {'name':u'Faroese',     'localname':u'Føroyskt'},
            'vls':{'name':u'West Flemish','localname':u'West-Vlams'},
            'nds-nl':{'name':u'Dutch Low Saxon','localname':u'Nedersaksisch'},
            'se': {'name':u'Northern Sami','localname':u'Sámegiella'},
            'rm': {'name':u'Romansh',     'localname':u'Rumantsch'},
            'ne': {'name':u'Nepali',      'localname':u'नेपाली'},
            'war':{'name':u'Waray-Waray', 'localname':u'Winaray'},
            'fur':{'name':u'Friulian',    'localname':u'Furlan'},
            'lij':{'name':u'Ligurian',    'localname':u'Líguru'},
            'nov':{'name':u'Novial',      'localname':u'Novial'},
            'bh': {'name':u'Bihari',      'localname':u'भोजपुरी'},
            'sco':{'name':u'Scots',       'localname':u'Scots'},
            'dv': {'name':u'Divehi',      'localname':u'ދިވެހިބަސް'},
            'pi': {'name':u'Pali',        'localname':u'पाऴि'},
            'diq':{'name':u'Zazaki',      'localname':u'Zazaki'},
            'ilo':{'name':u'Ilokano',     'localname':u'Ilokano'},
            'kk': {'name':u'Kazakh',      'localname':u'Қазақша', 'weight':1.3},
            'os': {'name':u'Ossetian',    'localname':u'Иронау'},
            'frp':{'name':u'Franco-Provençal/Arpitan','localname':u'Arpitan'},
            'zh-classical':{'name':u'Classical Chinese','localname':u'古文 / 文言文', 'similar_lang':'zh'},
            'mt': {'name':u'Maltese',     'localname':u'Malti'},
            'lad':{'name':u'Ladino',      'localname':u'Dzhudezmo'},
            'fiu-vro':{'name':u'Võro',        'localname':u'Võro'},
            'pdc':{'name':u'Pennsylvania German','localname':u'Deitsch'},
            'csb':{'name':u'Kashubian',   'localname':u'Kaszëbsczi'},
            'kw': {'name':u'Cornish',     'localname':u'Kernewek'},
            'bar':{'name':u'Bavarian',    'localname':u'Boarisch'},
            'to': {'name':u'Tongan',      'localname':u'faka Tonga'},
            'haw':{'name':u'Hawaiian',    'localname':u'Hawai`i'},
            'mn': {'name':u'Mongolian',   'localname':u'Монгол'},
            'ps': {'name':u'Pashto',      'localname':u'پښتو'},
            'ang':{'name':u'Anglo-Saxon', 'localname':u'Englisc'},
            'km': {'name':u'Khmer',       'localname':u'ភាសាខ្មែរ'},
            'gv': {'name':u'Manx',        'localname':u'Gaelg'},
            'tk': {'name':u'Turkmen',     'localname':u'تركمن / Туркмен'},
            'ln': {'name':u'Lingala',     'localname':u'Lingala'},
            'ie': {'name':u'Interlingue', 'localname':u'Interlingue'},
            'tpi':{'name':u'Tok Pisin',   'localname':u'Tok Pisin'},
            'crh':{'name':u'Crimean Tatar','localname':u'Qırımtatarca'},
            'jbo':{'name':u'Lojban',      'localname':u'Lojban', 'weight':1.2},
            'wo': {'name':u'Wolof',       'localname':u'Wolof'},
            'ay': {'name':u'Aymara',      'localname':u'Aymar'},
            'zea':{'name':u'Zealandic',   'localname':u'Zeêuws'},
            'eml':{'name':u'Emilian-Romagnol','localname':u'Emiliàn e rumagnòl'},
            'si': {'name':u'Sinhalese',   'localname':u'සිංහල'},
            'sc': {'name':u'Sardinian',   'localname':u'Sardu'},
            'or': {'name':u'Oriya',       'localname':u'ଓଡ଼ିଆ'},
            'ig': {'name':u'Igbo',        'localname':u'Igbo'},
            'mg': {'name':u'Malagasy',    'localname':u'Malagasy'},
            'cbk-zam':{'name':u'Zamboanga Chavacano','localname':u'Chavacano de Zamboanga'},
            'gu': {'name':u'Gujarati',    'localname':u'ગુજરાતી'},
            'ky': {'name':u'Kirghiz',     'localname':u'Кыргызча'},
            'kg': {'name':u'Kongo',       'localname':u'KiKongo'},
            'ty': {'name':u'Tahitian',    'localname':u'Reo Mā`ohi'},
            'glk':{'name':u'Gilaki',      'localname':u'گیلکی'},
            'arc':{'name':u'Assyrian Neo-Aramaic','localname':u'ܐܪܡܝܐ'},
            'mo': {'name':u'Moldovan',    'localname':u'Молдовеняскэ'},
            'gn': {'name':u'Guarani',     'localname':u'Avañe\'ẽ'},
            'kab':{'name':u'Kabyle',      'localname':u'Taqbaylit'},
            'so': {'name':u'Somali',      'localname':u'Soomaaliga'},
            'ks': {'name':u'Kashmiri',    'localname':u'कश्मीरी / كشميري'},
            'stq':{'name':u'Saterland Frisian','localname':u'Seeltersk'},
            'mzn':{'name':u'Mazandarani', 'localname':u'مَزِروني'},
            'cu': {'name':u'Old Church Slavonic','localname':u'Словѣньскъ'},
            'ce': {'name':u'Chechen',     'localname':u'Нохчийн'},
            'udm':{'name':u'Udmurt',      'localname':u'Удмурт кыл'},
            'tet':{'name':u'Tetum',       'localname':u'Tetun'},
            'sd': {'name':u'Sindhi',      'localname':u'سنڌي، سندھی ، सिन्ध'},
            'pap':{'name':u'Papiamentu',  'localname':u'Papiamentu'},
            'ba': {'name':u'Bashkir',     'localname':u'Башҡорт'},
            'pa': {'name':u'Punjabi',     'localname':u'ਪੰਜਾਬੀ'},
            'rmy':{'name':u'Romani',      'localname':u'romani - रोमानी'},
            'lo': {'name':u'Lao',         'localname':u'ລາວ'},
            'na': {'name':u'Nauruan',     'localname':u'dorerin Naoero'},
            'bcl':{'name':u'Central Bicolano','localname':u'Bikol'},
            'kaa':{'name':u'Karakalpak',  'localname':u'Qaraqalpaq tili'},
            'gan':{'name':u'Gan',         'localname':u'贛語', 'similar_lang':'zh'},
            'iu': {'name':u'Inuktitut',   'localname':u'ᐃᓄᒃᑎᑐᑦ'},
            'myv':{'name':u'Erzya',       'localname':u'Эрзянь (Erzjanj Kelj)'},
            'szl':{'name':u'Silesian',    'localname':u'Ślůnski'},
            'sah':{'name':u'Sakha',       'localname':u'Саха тыла (Saxa Tyla)'},
            'my': {'name':u'Burmese',     'localname':u'Burmese'},
            'ext':{'name':u'Extremaduran','localname':u'Estremeñu'},
            'hif':{'name':u'Fiji Hindi',  'localname':u'Fiji Hindi'},
            'bo': {'name':u'Tibetan',     'localname':u'བོད་སྐད་'},
            'srn':{'name':u'Sranan',      'localname':u'Sranantongo'},
            'got':{'name':u'Gothic',      'localname':u'ðミフᄇðミフ﾿ðミヘトðミフᄚðミヘツðミフᄚðミフᄊðミフᄈðミフᄚ'},
            'dsb':{'name':u'Lower Sorbian','localname':u'Dolnoserbšćina'},
            'bm': {'name':u'Bambara',     'localname':u'Bamanankan'},
            'sm': {'name':u'Samoan',      'localname':u'Gagana Samoa'},
            'cdo':{'name':u'Min Dong',    'localname':u'Mìng-dĕ̤ng-ngṳ̄'},
            'chr':{'name':u'Cherokee',    'localname':u'ᏣᎳᎩ ᎧᏬᏂᎯᏍᏗ'},
            'mdf':{'name':u'Moksha',      'localname':u'Мокшень (Mokshanj Kälj)'},
            'om': {'name':u'Oromo',       'localname':u'Oromoo'},
            'ee': {'name':u'Ewe',         'localname':u'Eʋegbe'},
            'as': {'name':u'Assamese',    'localname':u'অসমীয়া ভাষা আৰু লিপি'},
            'ti': {'name':u'Tigrinya',    'localname':u'ትግርኛ_ፊደል'},
            'ug': {'name':u'Uyghur',      'localname':u'Oyghurque'},
            'kv': {'name':u'Komi',        'localname':u'Коми'},
            'zu': {'name':u'Zulu',        'localname':u'IsiZulu'},
            'av': {'name':u'Avar',        'localname':u'Авар'},
            'nv': {'name':u'Navajo',      'localname':u'Diné bizaad'},
            'ss': {'name':u'Swati',       'localname':u'SiSwati'},
            'pih':{'name':u'Norfolk',     'localname':u'Norfuk'},
            'ts': {'name':u'Tsonga',      'localname':u'Xitsonga'},
            'cr': {'name':u'Cree',        'localname':u'Nehiyaw'},
            've': {'name':u'Venda',       'localname':u'TshiVenda'},
            'ch': {'name':u'Chamorro',    'localname':u'Chamoru'},
            'bi': {'name':u'Bislama',     'localname':u'Bislama'},
            'xh': {'name':u'Xhosa',       'localname':u'IsiXhosa'},
            'rw': {'name':u'Kinyarwanda', 'localname':u'Kinyarwanda'},
            'dz': {'name':u'Dzongkha',    'localname':u'རྫོང་ཁ་'},
            'tn': {'name':u'Tswana',      'localname':u'Setswana'},
            'kl': {'name':u'Greenlandic', 'localname':u'Kalaallisut'},
            'bug':{'name':u'Buginese',    'localname':u'Basa Ugi'},
            'ik': {'name':u'Inupiak',     'localname':u'Iñupiak uqautchit'},
            'bxr':{'name':u'Buryat (Russia)','localname':u'Буряад'},
            'st': {'name':u'Sesotho',     'localname':u'Sesotho'},
            'xal':{'name':u'Kalmyk',      'localname':u'Хальмг келн'},
            'ny': {'name':u'Chichewa',    'localname':u'Chicheŵa'},
            'ak': {'name':u'Akan',        'localname':u'Akana'},
            'ab': {'name':u'Abkhazian',   'localname':u'Аҧсуа бызшәа'},
            'fj': {'name':u'Fijian',      'localname':u'Na Vosa Vakaviti'},
            'lg': {'name':u'Luganda',     'localname':u'Luganda'},
            'tw': {'name':u'Twi',         'localname':u'Twi'},
            'ha': {'name':u'Hausa',       'localname':u'هَوُسَ'},
            'za': {'name':u'Zhuang',      'localname':u'Sawcuengh'},
            'ff': {'name':u'Fula',        'localname':u'Fulfulde'},
            'lbe':{'name':u'Lak',         'localname':u'Лакку маз'},
            'ki': {'name':u'Kikuyu',      'localname':u'Gĩgĩkũyũ'},
            'sn': {'name':u'Shona',       'localname':u'ChiShona'},
            'tum':{'name':u'Tumbuka',     'localname':u'ChiTumbuka'},
            'sg': {'name':u'Sango',       'localname':u'Sängö'},
            'ii': {'name':u'Sichuan Yi',  'localname':u'ꆇꉙ'},
            'chy':{'name':u'Cheyenne',    'localname':u'Tsetsêhestâhese'},
            'rn': {'name':u'Kirundi',     'localname':u'Kirundi'},
            'cho':{'name':u'Choctaw',     'localname':u'Chahta Anumpa'},
            'mh': {'name':u'Marshallese', 'localname':u'Kajin M̧ajeļ'},
            'aa': {'name':u'Afar',        'localname':u'Afar'},
            'ng': {'name':u'Ndonga',      'localname':u'Oshiwambo'},
            'kj': {'name':u'Kuanyama',    'localname':u'Kuanyama'},
            'ho': {'name':u'Hiri Motu',   'localname':u'Hiri Motu'},
            'mus':{'name':u'Muscogee',    'localname':u'Muskogee'},
            'kr': {'name':u'Kanuri',      'localname':u' Kanuri'},
            'hz': {'name':u'Herero',      'localname':u'Otsiherero'},
      #     'tokipona':{'name':u'Tokipona',    'localname':u'Tokipona'},       
            'arz':{'name':u'Egyptian Arabic', 'localname':u'مصرى (Maṣrī)', 'similar_lang':'ar'},
            'pnt':{'name':u'Pontic',          'localname':u'Ποντιακά', 'similar_lang':'el'},
            'mhr':{'name':u'Meadow Mari',     'localname':u'Олык Марий'},
            'ace':{'name':u'Acehnese',        'localname':u'Acèh'},
            'ckb':{'name':u'Soranî',          'localname':u'Soranî / کوردی'},
            'mwl':{'name':u'Mirandese',       'localname':u'Mirandés'},
            'pnb':{'name':u'Western Panjabi', 'localname':u'پنجابی'},
            'pcd':{'name':u'Picard',          'localname':u'Picard'},
            'krc':{'name':u'Karachay-Balkar', 'localname':u'Къарачай-Малкъар'},
            'frr':{'name':u'North Frisian',   'localname':u'Nordfriisk'},
            'bjn':{'name':u'Banjar',          'localname':u'Bahasa Banjar'},
            'mrj':{'name':u'Hill Mari',       'localname':u'Кырык Мары (Kyryk Mary)'},
            'koi':{'name':u'Komi-Permyak',    'localname':u'Перем Коми (Perem Komi)'},
            'gag':{'name':u'Gagauz',           'localname':u'Gagauz'},
            'pfl':{'name':u'Palatinate German','localname':u'Pfälzisch'},
            'rue':{'name':u'Rusyn',            'localname':u'русиньскый язык'},
            'ltg':{'name':u'Latgalian',        'localname':u'Latgaļu volūda'},
            'kbd':{'name':u'Kabardian',        'localname':u'Aдыгэбзэ'},
            'xmf':{'name':u'Mingrelian',       'localname':u'მარგალური'},
            'nso':{'name':u'Northern Sotho',   'localname':u'Sesotho sa Leboa'},
            'vep':{'name':u'Veps',             'localname':u'Vepsän kel\''},
            'lez':{'name':u'Lezgi',            'localname':u'Лезги'},
            'min':{'name':u'Minangkabau',      'localname':u'Minangkabau'},
            'hak':{'name':u'Hakka',            'localname':u'Hak-kâ-fa / 客家話'},
            'tyv':{'name':u'Tuva',             'localname':u'Тыва дыл'}}
 
 
#languages to process
lang_keys = ['en','ca','bat-smg']
lang_keys = lang_info.keys()
lang_keys.sort()
 
#optimize by caching iw-links
iw_cache = {}
en_labels = {}
 
#debug
max_words = -1 
 
prev_score = {}
 
#score colors
color10000 = 'BF5FFF'
color4000 = 'FF7F00'
color2000 = 'FFBE00'
color1000 = 'FFFF00'
color500  = 'BEFF00' 
color250  = '40FF00'
color100  = '00FF7D'
color0    = 'EFEFEF'
 
 
#format with spaces
def FormatNumber(s):
    r = []
    for i, c in enumerate(reversed(str(int(s)))):
        if i and i % 3 == 0:
            r.insert(0, ',')
        r.insert(0, c)
    return ''.join(r)
 
 
def GetPreviousScores():
 
    temp_path = "PreviousScores.txt"  
    if os.path.isfile(temp_path):
       temp_file = open(temp_path)
       for line in temp_file:
            tokens = line.split()
            prev_score[tokens[0]] = float(tokens[1])
       temp_file.close()
 
 
def GetArticle(item, wiki, lang):
    word = GetArticleInterwikiName(item, lang)
 
    if len(word) > 0:
        page = wikipedia.Page(wiki, word)
        article = page.get(get_redirect=True)
 
        if u'#REDIRECT' in article.upper():
            text_start = article.find('[[')
            text_end = article.find(']]', text_start)
            word = article[text_start+2:text_end]
            page = wikipedia.Page(wiki, word)
            article = page.get()
    else:
        article = ''
 
    return article
 
 
def GetArticleInterwikiName(item, lang):
 
    if item in iw_cache:
        iw_links = iw_cache[item]
    else:
        wikidata = wikipedia.Site('wikidata', 'wikidata')
        datapage = pywikibot.DataPage(wikidata, item)
        try:
            data_dict = datapage.get()
        except:
             print('Where is ' + item)
             return ''
 
        iw_links = data_dict[u'links']
        labels = data_dict[u'label']
        iw_cache[item] = iw_links
        if u'en' in labels:
            en_labels[item] = labels[u'en']
    lang_wiki = lang.replace("-","_") + u'wiki'
 
    if lang_wiki in iw_links:
        return iw_links[lang_wiki]
    else:
        return ''
 
 
def GetInterwikiLength(article):
 
    #calculate len of all interwiki links
    interwiki_len   = 0
    interwiki_last  = 0
    interwiki_colon = 0
    interwiki_nl    = 0
    interwiki_first = article.find(u'[[', interwiki_last)
    while interwiki_first > -1:    
        interwiki_last  = article.find(u']]', interwiki_first)
        interwiki_colon = article.find(u':',  interwiki_first)
        if interwiki_colon > -1 and interwiki_colon < interwiki_last:
           curlang = article[interwiki_first+2:interwiki_colon]
           if curlang in lang_info:
               interwiki_nl = article.find(u'\n', interwiki_last)
               if interwiki_nl > -1:
                  interwiki_len += (interwiki_nl - interwiki_first) + 1
               else:
                  interwiki_len += (interwiki_last - interwiki_first) + 2
        interwiki_first = article.find(u'[[', interwiki_last)
 
    return interwiki_len
 
 
def GetCommentLength(article):
 
    #calculate len of all comments
    comment_len   = 0
    comment_last  = 0
    comment_first = article.find(u'<!--', comment_last)
    while comment_first > -1:    
        comment_last = article.find(u'-->', comment_first)
        if comment_last == -1:
           comment_last = comment_first + 4
 
        comment_len += (comment_last - comment_first) - 4
        comment_first = article.find(u'<!--', comment_last)
 
    return comment_len
 
def IsArticleEnglish(article):
 
    #remove comments
    comments = re.compile(r'<!--(.|\n|\r)*?-->')
    article = comments.sub("", article)
 
    #remove references
    refs = re.compile(r'<ref(.|\n|\r)*?</ref>')
    article = refs.sub("", article)
 
    # convert article to lower case word list
    word_list = article.lower().split()
 
    if len(word_list) == 0:
        return False
 
    # create dictionary of word:frequency pairs
    freq_dic = {}
 
    # punctuation marks to be removed
    punctuation = re.compile(r'[.?!,":;]') 
    for word in word_list:
        word = punctuation.sub("", word)
        if word in freq_dic: 
            freq_dic[word] += 1
        else: 
            freq_dic[word] = 1
 
    # usually English is ~30% these words and non-English at most a few percent
    common_english_words = ['the','of','on','a','is','in','his','have','by','but','that','to','with','for',
                            'an','from''are','was','he','which','be','as','it','this','first', 'new', 'and',
                            'she','also','after','at','become','best','from','had','great', 'into','their',
                            'these','they','time','who','her','not','one','or', 'made', 'would','are','between']
    en_word_count = 0
    for word in common_english_words:
        if word in freq_dic:
            en_word_count += freq_dic[word]
 
    percent_thats_common_english = 100.0 * en_word_count / len(word_list)
 
    # flag if 20% or more in the list which means more than half the article is English 
    if percent_thats_common_english > 20 and  en_word_count > 20:
        print "Percent %f, %d out of %d" % (percent_thats_common_english, en_word_count, len(word_list))
        return True
    return False
 
 
 
def GetArticleType(wt_article_size):
   if wt_article_size == 0:
      return 'absent'
   elif 0 < wt_article_size < 10000:
      return 'stubs'
   elif 10000 <= wt_article_size < 30000:
      return 'articles'
   elif wt_article_size >= 30000:
      return 'longarticles'
 
def GetScoreForLang(lang):
    absent       = lang_info[lang]['absent']
    stubs        = lang_info[lang]['stubs']
    articles     = lang_info[lang]['articles']
    longarticles = lang_info[lang]['longarticles']
    return GetScore(absent, stubs, articles, longarticles)
 
def GetScore(absent, stubs, articles, longarticles):
    max_count = absent + stubs + articles + longarticles
    max_score = max_count * 9
    raw_score = stubs + (articles*4) + (longarticles*9)
    if max_score > 0:
        score = 100.0 * raw_score / max_score
    else:
        score = 0
    return score
 
def GetLink(subtable,lang,value):
    return '[[/'+subtable+'#' + lang +' '+lang_info[lang]['localname']+ '|' + value + ']]'
 
def GetTableNumber(count, min_subtable_count, max_subtable_count0, subtable, lang, max_subtable_count40=0):
    value = FormatNumber(count)
 
    max_subtable_count = max_subtable_count0
    if GetScoreForLang(lang) > 40 and max_subtable_count40 > 0:
        max_subtable_count = max_subtable_count40
 
    if count >= min_subtable_count and (count <= max_subtable_count or max_subtable_count==-1):
       return GetLink(subtable,lang,value)
    else:
       return value
 
 
num_lang = 0
 
def CalculateStatistics():
    for lang in lang_keys:
        CalculateStatisticsForLang(lang)
 
def GetWeightForLang(lang):
    lang_weight = 1.0
    if 'weight' in lang_info[lang]:
        lang_weight = lang_info[lang]['weight']
    elif 'similar_lang' in lang_info[lang]:
        lang_weight = lang_info[lang_info[lang]['similar_lang']]['weight']
    return lang_weight
 
def CalculateStatisticsForLang(lang):
 
    global num_lang
    num_lang += 1
 
    print ('=['+lang+' '+str(num_lang)+ '/' + str(len(lang_keys)) + ']').ljust(76,'=')
 
    try:
 
        lang_info[lang]['total_size']   = 0
        lang_info[lang]['absent']       = 0
        lang_info[lang]['stubs']        = 0
        lang_info[lang]['articles']     = 0
        lang_info[lang]['longarticles'] = 0
 
        lang_info[lang]['art_count']    = 0
 
        temp_path = "~%s_output.txt" % (lang)
        if os.path.isfile(temp_path):
 
            temp_file = open(temp_path)
 
            art_count = int(temp_file.readline())
            lang_info[lang]['art_count']    = art_count  
            for index in range(art_count):
                artKey = 'art_'+str(index)
                lang_info[lang][artKey] = {}
                lang_info[lang][artKey]['item']  = temp_file.readline().decode('utf_8').strip()
                lang_info[lang][artKey]['name']  = temp_file.readline().decode('utf_8').strip()
                lang_info[lang][artKey]['size']  = int(temp_file.readline())
                lang_info[lang][artKey]['error'] = temp_file.readline().decode('utf_8').strip()
 
            temp_file.close()
 
            print '..using previous %s result...' % (lang)
 
        else:        
 
            wiki = wikipedia.Site(lang, 'wikipedia')
 
            item_file = open("ItemList.txt")
            word_count = 0
 
            for line in item_file:
 
                word_count += 1
                if word_count > max_words > 0:
                    break
 
                item = line[:-1].decode('utf_8')
                article_size = 0
                error = ''
 
                try:
                    article          = GetArticle(item, wiki, lang)  
                    raw_article_size = len(article)
 
                    interwiki_len = GetInterwikiLength(article)
                    comment_len   = GetCommentLength(article)
                    article_size  = (raw_article_size - interwiki_len - comment_len)
 
                    if lang != "en" and lang != 'simple' and lang != 'sco' and IsArticleEnglish(article):
                        raise TypeError ("Wrong language, [[%s:%s]] has too much untranslated English." % (lang, GetArticleInterwikiName(item, lang).encode("utf-8")))
                    lang_weight = GetWeightForLang(lang)
                    print str(lang).ljust(3), str(word_count).rjust(3), item.ljust(30),
                    print str(article_size * lang_weight).rjust(11), str(lang_weight).rjust(5), str(interwiki_len).rjust(9), str(comment_len).rjust(9)
 
                except KeyboardInterrupt:
                    sys.exit(1)
 
                except Exception:
                    e = sys.exc_info()[1]
                    sys.stderr.write('\n')
                    traceback.print_exc()
                    sys.stderr.write('\n')
                    try:
                        error = CookString(unicode(str(e),'utf-8'))
                    except:
                        error = "Error."
 
                art_index = lang_info[lang]['art_count']
                artKey = 'art_'+str(art_index)
                lang_info[lang][artKey] = {}
                lang_info[lang][artKey]['item'] = item
                if item in en_labels:
                    lang_info[lang][artKey]['name'] = en_labels[item]
                else:
                    lang_info[lang][artKey]['name'] = item
                lang_info[lang][artKey]['size'] = article_size
                lang_info[lang][artKey]['error'] = error
                lang_info[lang]['art_count'] = art_index + 1  
 
            item_file.close()
 
            temp_file = open(temp_path,'w')
            temp_file.write(str(lang_info[lang]['art_count'])+'\n')
            for index in range(lang_info[lang]['art_count']):
                artKey = 'art_'+str(index)
                temp_file.write(lang_info[lang][artKey]['item'].encode('utf_8')+'\n')
                temp_file.write(lang_info[lang][artKey]['name'].encode('utf_8')+'\n')
                temp_file.write(str(lang_info[lang][artKey]['size'])+'\n')
                temp_file.write(lang_info[lang][artKey]['error'].encode('utf_8')+'\n')
            temp_file.close()
 
        for index in range(lang_info[lang]['art_count']):
            artKey = 'art_'+str(index)
            article_size    = lang_info[lang][artKey]['size']
            wt_article_size = article_size * GetWeightForLang(lang)
            article_type    = GetArticleType(wt_article_size)
            if not lang_info[lang][artKey]['error']:
                lang_info[lang][article_type] = lang_info[lang][article_type] + 1
                lang_info[lang]['total_size'] = lang_info[lang]['total_size'] + article_size
 
    except:
        sys.stderr.write('\n')
        traceback.print_exc()
        sys.stderr.write('\n')
 
 
 
def GetGrowthNumber(lang, score):
    """Difference between the current score and the previously published
    score for this wiki; None when no previous score is on record."""
    return score - prev_score[lang] if lang in prev_score else None
 
def GetGrowth(lang, score):
    """Format the score growth as a signed string, or "n/a" when no
    previous score is known for this wiki."""
    if lang not in prev_score:
        return "n/a"
    text = "%+2.2f" % round(GetGrowthNumber(lang, score), 2)
    # Rounding a tiny negative change can produce "-0.00"; show it as
    # "+0.00" instead.
    return '+0.00' if text == '-0.00' else text
 
 
def GetAverageSize(lang, article_count):
    """Mean raw article size for *lang*, scaled by the language weight.

    Returns 0 when the wiki has no scored articles.
    """
    if article_count <= 0:
        return 0
    # Same division as the original accounting (Python 2 semantics).
    mean_raw = int(round(lang_info[lang]['total_size'] / article_count))
    return int(mean_raw * GetWeightForLang(lang))
 
 
 
def GetMedianSize(lang):
    """Median of the non-zero raw sizes for *lang*, scaled by the
    language weight; 0 when every sampled article is absent."""
    sizes = []
    for idx in range(lang_info[lang]['art_count']):
        raw = lang_info[lang]['art_' + str(idx)]['size']
        if raw > 0:
            sizes.append(raw)
    sizes.sort()

    if not sizes:
        return 0

    mid = len(sizes) // 2
    if len(sizes) % 2:
        median_size = sizes[mid]
    else:
        median_size = (sizes[mid - 1] + sizes[mid]) / 2
    return int(median_size * GetWeightForLang(lang))
 
def PrintResults():
    """Print a plain-text summary table of every wiki's statistics to
    stdout, ordered by score, best first."""

    lang_keys.sort(key=GetScoreForLang, reverse=True)

    print '\n'
    print 'RESULTS\n----------------------------------------------------------------------'
    print u'Lang:',' AvgSize','Median','Absent',' <10k ','10-30k',' >30k ', 'Score', 'Growth'
    for lang in lang_keys:

        absent        = lang_info[lang]['absent']
        stubs         = lang_info[lang]['stubs']
        articles      = lang_info[lang]['articles']
        longarticles  = lang_info[lang]['longarticles']

        article_count = stubs + articles + longarticles
        score         = GetScore(absent, stubs, articles, longarticles)
        growth        = GetGrowth(lang, score)
        avg_size      = GetAverageSize(lang, article_count)
        med_size      = GetMedianSize(lang)

        # Trailing commas keep the whole row on one line (Python 2 print).
        print lang.ljust(6),
        print str(avg_size).rjust(7),
        print str(med_size).rjust(7),
        print str(absent).rjust(5),
        print str(stubs).rjust(6),
        print str(articles).rjust(6),
        print str(longarticles).rjust(6),
        print ("%6.2f" % score).rjust(6),
        print growth.rjust(6)
 
def GetWikiTableResults(awards):
    """Render the main per-wiki results table as wikitext, ordered by
    score (best first).  *awards* is used only to decide whether a
    wiki's growth cell should link to its awards section."""

    lang_keys.sort(key=GetScoreForLang, reverse=True)

    table = '{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"'
    table += '\n|-\n'
    table += u'!width = 45 | № !! width = 55 | Wiki !! width = 220 | Language !! width = 55 | [[Talk:List of Wikipedias by sample of articles/Archives/2007#Proposed weighting of characters for formula_.28Option.232_using_Babel_text.29|Weight]] !! width = 120 | Mean Article<br>Size !! width = 120 | [[Talk:List_of_Wikipedias_by_sample_of_articles#average_or_median.3F|Median Article<br>Size]] !! width = 80 | [[/Absent Articles|Absent]]<br>(0k) !! width=80| Stubs<br>(< 10k)!! width = 80 | Articles<br>(10-30k) !! width = 80 | Long Art.<br>(> 30k) !! width = 80 | [[Talk:List of Wikipedias by sample of articles/Archives/2008#Other possibility of maximum score|Score]]'
    table += '!! width = 50 | [[Talk:List of Wikipedias by sample of articles/Archives/2008#Script_extension|Growth]]'
    table += '\n|-\n'
    i=0
    for lang in lang_keys:
        i += 1

        absent        = lang_info[lang]['absent']
        stubs         = lang_info[lang]['stubs']
        articles      = lang_info[lang]['articles']
        longarticles  = lang_info[lang]['longarticles']

        article_count = stubs + articles + longarticles

        # A wiki with no data at all is footnoted with a dagger and
        # treated as missing every sampled article.
        dagger = u'†'
        if absent + article_count == 0:
            lang_footnote = dagger
            absent = lang_info['en']['art_count']
        else:
            lang_footnote = ''

        table += '|' + str(i) + '\n'
        table += '| [[:' + lang + ':|' + lang + ']]' + lang_footnote + '\n'
        table += '| style = "text-align: left" | [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]\n'

        # '**' marks a weight borrowed from a similar language,
        # '*' marks the default weight of 1.0.
        if 'weight' in lang_info[lang]:
            weight = str(lang_info[lang]['weight'])
        elif 'similar_lang' in lang_info[lang]:
            weight = str(lang_info[lang_info[lang]['similar_lang']]['weight']) + '**'
        else:
            weight = '1.0*'

        score         = GetScore(absent, stubs, articles, longarticles)
        growth        = GetGrowth(lang, score)
        avg_size      = GetAverageSize(lang, article_count)
        med_size      = GetMedianSize(lang)


        if HasAwards(awards, lang):
            growth = GetLink('Growth',lang, growth)

        table += '| ' + weight + '\n'
        table += '| ' + GetTableNumber(avg_size,     1,  -1,'Neglected',       lang) + '\n'
        table += '| ' + FormatNumber(med_size) + '\n'
        table += '| ' + GetTableNumber(absent,       1, 250,'Absent Articles', lang) + '\n'
        table += '| ' + GetTableNumber(stubs,        1, 100,'Stubs',           lang, 250) + '\n'
        table += '| ' + GetTableNumber(articles,     1, 100,'Articles',        lang, 250) + '\n'
        table += '| ' + GetTableNumber(longarticles, 1, 100,'Long Articles',   lang) + '\n'

        #color code score (u'\u0023' is '#', kept out of the wikitext source)
        if score >= 100.00:
            color = "|style = \"background: "+u'\u0023'+color10000+"\""
        elif score >= 40.00:
            color = "|style = \"background: "+u'\u0023'+color4000+"\""
        elif score >= 20.00:
            color = "|style = \"background: "+u'\u0023'+color2000+"\""
        elif score >= 10.00:
            color = "|style = \"background: "+u'\u0023'+color1000+"\""
        elif score >= 5.00:
            color = "|style = \"background: "+u'\u0023'+color500+"\""
        elif score >= 2.50:
            color = "|style = \"background: "+u'\u0023'+color250+"\""
        elif score >= 1.00:
            color = "|style = \"background: "+u'\u0023'+color100+"\""
        else:
            color = "|style = \"background: "+u'\u0023'+color0+"\""

        table += color + '| ' + ("%.2f" % score) + '\n'
        table += '| ' + growth + '\n'
        table += '|-\n'

    # Drop the final '|-' row separator and close the table.
    table = table[:-2] + '}'
    return table
 
def GetWikiTableArticles(article_type, min_articles, max_articles_0, max_articles_40=0):
    """Build per-wiki wikitext sections listing every sampled article
    whose weighted-size class equals *article_type*.

    A wiki's section is included only when its match count lies in
    [min_articles, cap]; the cap is max_articles_40 for wikis scoring
    above 40 (when that cap is given), otherwise max_articles_0.
    Sorts the global lang_keys alphabetically as a side effect.
    """
    lang_keys.sort()
    table = u''
    for lang in lang_keys:
        score = GetScoreForLang(lang)
        cap = max_articles_40 if (score > 40 and max_articles_40 > 0) else max_articles_0

        section = u'===' + lang + ' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
        matches = 0
        for idx in range(lang_info[lang]['art_count']):
            key = 'art_' + str(idx)
            if GetArticleType(GetArticleSize(lang, key)) == article_type:
                art = lang_info[lang][key]
                section += '#[[d:' + art['item'] + '|' + art['name'] + ']] ' + art['error'] + '\n'
                matches += 1
        if min_articles <= matches <= cap:
            table += section

    return table
 
 
def GetArticleName(lang, artKey):
    """Display name recorded for *artKey* on this wiki, or 0 when the
    key is unknown.

    NOTE(review): the falsy sentinel 0 (rather than '' or None) mirrors
    the original implementation; callers may rely on it.
    """
    record = lang_info[lang].get(artKey)
    return record['name'] if record is not None else 0
 
 
def GetArticleSize(lang, artKey):
    """Weighted size of the article *artKey* on this wiki.

    Returns 0 when the key is unknown or the article was fetched with
    an error recorded.
    """
    record = lang_info[lang].get(artKey)
    if record is None or record['error']:
        return 0
    return record['size'] * GetWeightForLang(lang)
 
def GetEdgeFactor(lang, artKey):
    """Extra neglect points for articles sitting just below a scoring
    boundary, where a small expansion would promote them to the next
    class; absent articles (size 0) get a flat 1.

    Ramps from 0 up to ~3 approaching the 10k boundary and from 0 up to
    ~6 approaching the 30k boundary (cf. the "0 to 6" note in
    GetNeglectForArticle).
    """
    size = GetArticleSize(lang, artKey)
    if size == 0:
        return 1
    # BUG FIX: the original tested "7000 < size < 1000", which can never
    # be true; the intended window is just below the 10k threshold.
    if 7000 < size < 10000:
        return (size - 7000) / 1000
    if 24000 < size < 30000:
        return (size - 24000) / 1000
    return 0
 
def GetRuntFactor(lang, artKey):
    """Return 4 when *artKey* is this wiki's smallest existing article
    (the "runt"), else 0; absent articles never qualify."""
    size = GetArticleSize(lang, artKey)
    if size <= 0:
        return 0
    for idx in range(lang_info['en']['art_count']):
        otherKey = 'art_' + str(idx)
        if otherKey == artKey:
            continue
        otherSize = GetArticleSize(lang, otherKey)
        if 0 < otherSize < size:
            return 0  # a smaller existing article exists; not the runt
    return 4
 
 
def GetArticlePoints(lang, artKey):
    """Score one article by weighted size: 0 absent, 1 stub (below 10k),
    4 article (10k-30k), 9 long article (30k and above).

    BUG FIX: the original used strict '>' on both boundaries, so a size
    of exactly 10000 or exactly 30000 fell through every branch and
    scored 0 — the same as an absent article.
    """
    size = GetArticleSize(lang, artKey)
    if size <= 0:
        return 0
    if size < 10000:
        return 1
    if size < 30000:
        return 4
    return 9
 
 
def GetAverageArticlePoints(artKey):
    """Mean point score of this article across every surveyed wiki."""
    points = [GetArticlePoints(lang, artKey) for lang in lang_keys]
    return float(sum(points)) / len(points)
 
 
def GetAverageArticleSize(artKey):
    """Mean weighted size of this article across every surveyed wiki."""
    sizes = [GetArticleSize(lang, artKey) for lang in lang_keys]
    return int(float(sum(sizes)) / len(sizes))
 
 
def GetNeglectForArticle(lang, artInfo):
    """Neglect score for one article on one wiki: how far the wiki lags
    the cross-wiki average, plus bonuses for near-boundary ("edge") and
    smallest-on-wiki ("runt") articles."""
    key = artInfo['artKey']
    return (GetAverageArticlePoints(key)    # what wikis achieve on average
            - GetArticlePoints(lang, key)   # minus what this wiki achieves
            + GetEdgeFactor(lang, key)      # near a promotion boundary
            + GetRuntFactor(lang, key))     # this wiki's smallest article
 
def GetArticlesSortedByNeglect(lang):
    """Return a list of {artKey, popularity, neglect} dicts covering
    every sampled article, most neglected first (popularity breaks
    ties); empty when the English article list is not loaded."""
    artInfos = []
    if 'art_count' in lang_info['en']:
        for idx in range(lang_info['en']['art_count']):
            info = {'artKey': 'art_' + str(idx)}
            info['popularity'] = GetAverageArticleSize(info['artKey'])
            info['neglect'] = GetNeglectForArticle(lang, info)
            artInfos.append(info)
    artInfos.sort(key=lambda x: (x['neglect'], x['popularity']), reverse=True)
    return artInfos
 
def GetLargestArticles(artKey, maxLangs):
    """Interwiki links to the *maxLangs* largest versions of this
    article, largest first, joined with ' -- '."""
    ordered = sorted(lang_info, key=lambda lang: GetArticleSize(lang, artKey), reverse=True)

    item = lang_info['en'][artKey]['item']

    links = []
    for lang in ordered[0:maxLangs]:
        links.append('[[' + lang + ':' + GetArticleInterwikiName(item, lang) + '|' + lang + ':' + FormatNumber(GetArticleSize(lang, artKey)) + ']]')
    return ' -- '.join(links)
 
def GetArticleTypeCount(artKey, points):
    """Number of surveyed wikis scoring exactly *points* on this article."""
    return sum(1 for lang in lang_keys if GetArticlePoints(lang, artKey) == points)
 
 
def GetNeglectedArticles(lang, max_articles):
    """Wikitext numbered list of the *max_articles* most neglected
    articles for this wiki, each linked to its Wikidata item and (when
    the article exists) its local page with the weighted size."""

    artInfos = GetArticlesSortedByNeglect(lang)

    i=0
    table = u''
    for artInfo in artInfos:

       if artInfo['artKey'] in lang_info[lang]:

           item = lang_info[lang][artInfo['artKey']]['item']
           name = lang_info[lang][artInfo['artKey']]['name']
           table += '#[[d:'+item+'|'+name+']]'

           size = int(GetArticleSize(lang, artInfo['artKey']))
           if size > 0:
               # Link the size to the local article when an interwiki
               # title is known; otherwise show the bare number.
               iw_name = GetArticleInterwikiName(item, lang)
               if iw_name == '':
                   table += ' ('+str(size) + ')'
               else:
                   iw_link = lang+':'+iw_name
                   table += ' ([['+iw_link+'|'+str(size)+']])'

           table += '\n'

       # Unknown keys still consume a slot in the max_articles budget.
       i+=1
       if i >= max_articles: break

    return table
 
 
def GetPopularArticles(max_articles):
    """Wikitext table of sampled articles ordered by cross-wiki average
    size ("popularity"), with per-class wiki counts and links to the
    four largest versions.  A non-positive *max_articles* lists all."""

    artInfos = GetArticlesSortedByNeglect('en')
    artInfos.sort(key=lambda x: x['popularity'], reverse=True)

    i=0

    table = '{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"'
    table += '\n|-\n'
    table += u'!width = 45 | № !! width = 90 | Average Size !! width = 150 | Article Name !! width = 80 | [[Talk:List of Wikipedias by sample of articles#Article metric|Absent<br>(0k)]] !! width=80| Stubs<br>(< 10k)!! width = 80 | Articles<br>(10-30k) !! width = 80 | Long Art.<br>(> 30k) !! width = 150 | Largest Articles\n'

    for artInfo in artInfos:
       i+=1
       artKey = artInfo['artKey']
       table += '|-\n'
       table += '|' + str(i)
       table += '||'+FormatNumber(artInfo['popularity'])
       table += '||style="text-align:left"|[[d:'+lang_info['en'][artKey]['item']+'|'+lang_info['en'][artKey]['name']+']]'
       table += '||'+str(GetArticleTypeCount(artKey,0))
       table += '||'+str(GetArticleTypeCount(artKey,1))
       table += '||'+str(GetArticleTypeCount(artKey,4))
       table += '||'+str(GetArticleTypeCount(artKey,9))
       table += '||'+GetLargestArticles(artKey,4)+'\n'
       # The chained comparison is False for max_articles <= 0.
       if i >= max_articles > 0: break

    table += '|}\n'

    return table
 
 
def GetWikiNeglectedArticles():
    """Build the full 'Popular Articles' + per-wiki 'Neglected Articles'
    wikitext report, printing progress to stdout as it goes."""
    lang_keys.sort()
    table = u''

    print 'writing Popular Articles...'
    table += u'==Popular Articles==\n'
    table += GetPopularArticles(-1)

    print 'writing Neglected Articles...'
    table += u'==Neglected Articles==\n'
    for lang in lang_keys:
        print ' '+lang

        if lang_info[lang]['art_count'] > 0:
            table += u'==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
            table += GetNeglectedArticles(lang, 10)

        # Append an Errors subsection only when this wiki had fetch errors.
        has_errors = False
        section = u'====Errors====\n'
        for index in range(lang_info[lang]['art_count']):
            artKey  = 'art_'+str(index)
            if lang_info[lang][artKey]['error']  :
               # NOTE(review): the link label comes from the English
               # entry ('en'), not this wiki's — presumably deliberate,
               # since the local name may be missing; confirm.
               section = section + '#[[d:'+lang_info[lang][artKey]['item']+'|'+lang_info['en'][artKey]['name']+']] '+lang_info[lang][artKey]['error'] + '\n'
               has_errors = True
        if has_errors:
            table = table + section

    return table
 
 
 
def SaveWikiTableResults(awards):
 
    print 'writing Results...'
    f = open('results.txt', 'w')
    f.write(GetWikiTableResults(awards).encode("utf_8"))
    f.close()
 
    print 'writing Absent...'
    f = open('_absent.txt', 'w')
    f.write(GetWikiTableArticles('absent',1, 250).encode("utf_8"))
    f.close()
 
    print 'writing Stubs...'
    f = open('_stub.txt', 'w')
    f.write(GetWikiTableArticles('stubs',1, 100, 250).encode("utf_8"))
    f.close()
 
    print 'writing Articles...'
    f = open('_articles.txt', 'w')
    f.write(GetWikiTableArticles('articles',1, 100, 250).encode("utf_8"))
    f.close()
 
    print 'writing Long Articles...'
    f = open('_longarticles.txt', 'w')
    f.write(GetWikiTableArticles('longarticles',1,100).encode("utf_8"))
    f.close()
 
    print 'writing Awards...'
    f = open('_growth.txt', 'w')
    f.write(GetWikiAwards(awards).encode("utf_8"))
    f.close()
 
    print 'writing Suggestions...'
    f = open('_neglectedarticles.txt', 'w')
    f.write(GetWikiNeglectedArticles().encode("utf_8"))
    f.close()
 
 
 
def CookString(rawString):
    """Decode backslash escape sequences in *rawString* by re-parsing
    each fragment as a unicode string literal.

    Doubling every single quote before split("|") makes each quote show
    up as an empty fragment, so quotes are restored verbatim and never
    reach eval().

    NOTE(security): this eval()s fragments of the input — only feed it
    trusted text (here it is used on exception messages).
    """
    pieces = []
    for fragment in rawString.replace("'", "||").split("|"):
        if fragment:
            pieces.append(eval("u'" + fragment + "'"))
        else:
            pieces.append("'")
    return ''.join(pieces)
 
def GetGrowths(article):
    """Parse a rendered results table and return {lang: growth}.

    Scans for '[[:xx:|' interwiki links, then for the colour-styled
    score cell that follows each one, and reads the growth value from
    the next table cell.  Unparseable cells (e.g. 'n/a') count as 0.
    """
    growths = {}
    pos = article.find(u'[[:', 0)
    while pos > -1:
        pipe = article.find(u'|', pos)
        if pipe == -1:
            break
        # '[[:xx:|...' — the slice drops the trailing ':'.
        lang = article[pos+3:pipe-1]
        score_start = article.find(u'style = "background:', pipe)
        if score_start == -1:
            break
        # Skip past the colour spec (+32) to the '|' opening the growth
        # cell on the next table row.
        score_end = article.find(u'|', score_start+32)
        if score_end == -1:
            break
        line_end = article.find(u'\n', score_end)
        cell = article[score_end+2:line_end]
        try:
           inner_pipe = cell.find(u'|')
           if inner_pipe > -1:
              # Linked growth '[[...|+1.23]]': keep the text, drop ']]'.
              cell = cell[inner_pipe+1:-2]
           if cell.find(u' ‡') > -1:
              cell = cell[0:-2]
           growth = float(cell)
        except:
           growth = 0
        growths[lang] = growth
        pos = article.find(u'[[:', score_end)
    return growths
 
def GetLastUpdated(article):
    """Extract the 'Last Update' date from a rendered page and normalise
    it to 'DD Mon YYYY' (zero-padded day, 3-letter month).

    Returns None when no parseable date is found.

    BUG FIX: the original returned None whenever '<br/>' was missing,
    even when a '(' terminator was present; now whichever terminator
    occurs first (ignoring absent ones) ends the date.
    """
    date_first = article.find(u'Last Update')
    if date_first == -1:
        return None

    date_last_paren = article.find(u'(', date_first)
    date_last_br = article.find(u'<br/>', date_first)
    ends = [i for i in (date_last_paren, date_last_br) if i > -1]
    if not ends:
        return None
    date_last = min(ends)

    # Handle both 'Last Update - <date>' and 'Last Update: <date>'.
    hyphen = article.find(u'-', date_first, date_last)
    if hyphen > -1:
        date_first = hyphen + 1
    else:
        date_first += 12

    parts = article[date_first:date_last].strip().split(' ')
    if len(parts[0]) == 1:
        parts[0] = '0' + parts[0]
    if parts[0][0] == ':':
        # Leftover ':' from an unspaced 'Last Update:5 ...' layout.
        parts[0] = '0' + parts[0][1]
    parts[1] = parts[1][0:3]
    return ' '.join(parts)
 
# Module-level scratch used only by CalculatePlacing's sort key below.
growthsG = {}

def CalculatePlacing(growths,oldid,update):
    """Rank wikis by growth for one update and return the award placings:
    top three always place; further wikis get an Honorable Mention while
    their growth exceeds 1.  Each placing records lang, growth, the page
    oldid, the update label, a place string and a ribbon image."""
    global growthsG
    # NOTE(review): the global is only needed so the lambda below can see
    # the growths mapping; a closure over 'growths' would work as well.
    growthsG = growths
    lang_keys = growths.keys()
    lang_keys.sort(key=lambda x: growthsG[x], reverse=True)
    placeNo=0

    print update

    placing = []
    for lang in lang_keys:
        if placeNo < 3 or growths[lang] > 1:
           placeNo += 1
           if placeNo==1:
              placestr = '1st Place'
              ribbonimg = 'Article blue.svg'
           elif placeNo==2:
              placestr = '2nd Place'
              ribbonimg = 'Article red.svg'
           elif placeNo==3:
              placestr = '3rd Place'
              ribbonimg = 'Article yellow.svg'
           elif placeNo>3:
              placestr = 'Honorable Mention'
              ribbonimg = 'Article green.svg'

           print " %d  %-3s %+2.2f" % (placeNo, lang, growths[lang])

           place = {'lang':lang,'growth':growths[lang],'oldid':oldid,'update':update,'placestr':placestr,'ribbonimg':ribbonimg}
           placing.append(place)
    return placing
 
 
def GetPreviousAwards():
    """Replay the meta page's 2009-2013 revision history and recompute
    the award placings for every distinct update.

    Returns {update_label: placing_list} as produced by CalculatePlacing.
    """

    article_name = 'List of Wikipedias by sample of articles'

    meta_wiki = wikipedia.Site('meta', 'meta')
    meta_page = wikipedia.Page(meta_wiki, article_name)

    awards = {}
    prevUpdate = ''
    prevGrowth = -999

    for oldid,datetime,username,comments,cursize,curx in meta_page.getVersionHistory():
        # Only revisions that look like score updates; the hard-coded
        # oldids are excluded — presumably known-bad revisions (TODO confirm).
        if ('2009' in datetime or '2010' in datetime or '2011' in datetime or '2012' in datetime or '2013' in datetime) and ("updat" in comments.lower() or 'correct' in comments.lower()) and oldid!=2228213 and oldid!=2264612 and oldid!=3122655 and oldid!=3359817:
            article   = meta_page.getOldVersion(get_redirect=False,oldid=oldid)
            growths = GetGrowths(article)
            if 'en' in growths:
                update = GetLastUpdated(article)
                growth = growths['en']
                # Skip duplicate snapshots of the same update (same date
                # and same 'en' growth), except for newer revisions.
                if update != prevUpdate and ( prevGrowth != growth or oldid > 3807780 ):
                    prevUpdate = update
                    prevGrowth = growth
                    awards[update] = CalculatePlacing(growths,oldid,update)
    return awards
 
def HasAwards(awards, lang):
    """True when *lang* appears in any placing of any update in *awards*."""
    return any(place['lang'] == lang
               for placings in awards.values()
               for place in placings)
 
def CompareRows(rowA, rowB):
    """Old-style comparator for award rows: place string ascending
    ('1st Place' < '2nd Place' < ... < 'Honorable Mention'), then
    growth descending within the same place.

    FIX: uses (a > b) - (a < b) instead of the cmp() builtin, which was
    removed in Python 3; the result is identical under Python 2.
    """
    placeA = rowA['place']['placestr']
    placeB = rowB['place']['placestr']
    if placeA == placeB:
        growthA = rowA['place']['growth']
        growthB = rowB['place']['growth']
        return (growthB > growthA) - (growthB < growthA)
    return (placeA > placeB) - (placeA < placeB)
 
def GetWikiAwards(awards):
    """Render the per-wiki improvement-award tables as wikitext; wikis
    with no placings are omitted."""
    table = u'==2009-2013 Improvement Awards==\n'
    for lang in lang_keys:
        section = u'==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
        rows = []
        for update, placings in awards.items():
           for place in placings:
               if lang == place['lang']:
                  mid_section = '|-\n'
                  mid_section += '|width = 150 | [[Image:%s|20px]] %s\n' % (place['ribbonimg'],place['placestr'])
                  # oldid -1 marks today's (not yet saved) update, so it
                  # links to the live page instead of a revision.
                  if place['oldid'] == -1:
                      mid_section += '|width = 120 align=center| [[:m:List of Wikipedias by sample of articles|%s]]\n' % (place['update'])
                  else:
                      mid_section += '|width = 120 align=center| <span class="plainlinks">[http://meta.wikimedia.org/w/index.php?title=List_of_Wikipedias_by_sample_of_articles&oldid=%s %s]</span>\n' % (place['oldid'],place['update'])
                  mid_section += '|width = 80 align=center| %+2.2f\n' % round(place['growth'],2)
                  rows.append({'place':place,'mid_section':mid_section})
        if len(rows) > 0:
            rows.sort(CompareRows)
            # Multi-row award lists get a sortable table shell.
            if len(rows) > 1:
                section += '{|class="wikitable sortable" cellpadding="6" cellspacing="0"\n'
                section += '! !! !!\n'
            else:
                section += '{|class="wikitable" cellpadding="6" cellspacing="0"\n'
            for row in rows:
                section += row['mid_section']
            section += '|}\n'
            table += section
    return table
 
def CalculateAwards():
    """Compute today's growth placings from the freshly-calculated
    statistics and merge them with the historical awards.

    Returns {update_label: placing_list}; today's entry uses oldid -1
    (no saved revision exists yet).
    """

    print "calculating awards..."

    todays = {}
    for lang in lang_keys:
        absent        = lang_info[lang]['absent']
        stubs         = lang_info[lang]['stubs']
        articles      = lang_info[lang]['articles']
        longarticles  = lang_info[lang]['longarticles']
        score         = GetScore(absent, stubs, articles, longarticles)
        growth        = GetGrowthNumber(lang, score)
        todays[lang] = growth

    update = strftime("%d %b %Y")
    placing = CalculatePlacing(todays,-1,update)

    awards = GetPreviousAwards()
    awards[update] = placing
    return awards
 
#support dividing up work
# Optional CLI: "script.py <part> <numparts>" restricts the run to every
# numparts-th language, so several instances can split the workload.
if len(sys.argv) == 3:
    part      = int(sys.argv[1])-1
    numparts  = int(sys.argv[2])
    lang_keys = filter(lambda lang: lang_keys.index(lang) % numparts == part, lang_keys)


# Main sequence: load previous scores, fetch/score every wiki's sample,
# compute awards, then print and save all reports.
GetPreviousScores()
CalculateStatistics()
awards = CalculateAwards()
PrintResults()
SaveWikiTableResults(awards)

GetItemList.py[edit]

# -*- coding: utf_8 -*-
# GetItemList.py: scrape the meta page 'List of articles every Wikipedia
# should have' and write each article's Wikidata item id to ItemList.txt,
# printing a per-category progress listing and a grand total.
import sys
sys.path.append('./pywikipedia')
 
import wikipedia
import pagegenerators
import catlib
import traceback
import os
 
article_name = 'List of articles every Wikipedia should have'
 
meta_wiki = wikipedia.Site('meta', 'meta')
meta_page = wikipedia.Page(meta_wiki, article_name)
article   = meta_page.get(get_redirect=False)
 
f = open('ItemList.txt', 'w')
count = 0
grand_total = 0
 
# Walk every '[[d:Qxxx|...]]' link in page order.
name_last  = 0
name_first = article.find(u'[[d:', name_last)
while name_first > -1:
    name_mid  = article.find(u'|',  name_first)
 
 
    # A '== section ==' heading between the previous link and this one
    # starts a new category; reset the per-category counter.
    cat_start =article.rfind(u'\n== ', name_last, name_first)
    if cat_start > -1:
        cat_end   = article.find(u'==',cat_start+3, name_first)
        if cat_end > -1: 
            cat   = article[cat_start+3:cat_end]
            print
            print cat
            print ''.center(len(cat),'-')
            count = 0
 
    # Item id ends at ']]' or at the '|' label separator, whichever
    # comes first.
    name_last = article.find(u']]', name_first)
    if name_last > name_mid:
      name_last = name_mid
    article_item = article[name_first+4:name_last]
    f.write(article_item.encode("utf_8"))
    f.write('\n')
    count += 1
    grand_total += 1
    print count, article_item
    name_first = article.find(u'[[d:', name_last)
 
f.close()
 
print ''
print 'GRAND TOTAL'
print '-----------'
print  grand_total, 'articles'

GetPreviousScores.py[edit]

# -*- coding: utf_8 -*-
# GetPreviousScores.py: scrape the current 'List of Wikipedias by sample
# of articles' page and write "lang score" pairs to PreviousScores.txt,
# to be used as the growth baseline for the next run.
import sys
sys.path.append('./pywikipedia/')
 
import wikipedia
import pagegenerators
import catlib
import traceback
import os
 
article_name = 'List of Wikipedias by sample of articles'
 
meta_wiki = wikipedia.Site('meta', 'meta')
meta_page = wikipedia.Page(meta_wiki, article_name)
article   = meta_page.get(get_redirect=False)
 
f = open('PreviousScores.txt', 'w')
count = 0
lang_last  = 0
lang_first = article.find(u'[[:', lang_last)
while lang_first > -1:
    lang_last  = article.find(u'|',  lang_first)
 
    # '[[:xx:|...' — the slice drops the trailing ':'.
    lang = article[lang_first+3:lang_last-1]
 
    # The score sits between the colour-styled cell markup and the next
    # '|'.  NOTE(review): unlike GetGrowths in the main script, the -1
    # "not found" results are not guarded here; a malformed page could
    # produce bad slices.
    score_first = article.find(u'style = "background:',lang_last)
    score_last  = article.find(u'|', score_first+32)
 
    score = article[score_first+31:score_last-1]
 
    f.write(lang + ' ' + score + '\n')
 
    count += 1
    print count, lang, score
    lang_first = article.find(u'[[:', score_last)
 
f.close()