List of Wikipedias by expanded sample of articles/Source code

This code is a fork of List of Wikipedias by sample of articles/Source code and is used to calculate List of Wikipedias by expanded sample of articles.

ListExpandedSample.py

# -*- coding: utf_8 -*-
import sys
sys.path.append('./pywikipedia/core')
sys.path.append('./requests/requests')

import pywikibot
from pywikibot import pagegenerators
#from pywikibot import catlib
import traceback
import os
import re
#import query
from pywikibot.data import api
import simplejson as json
from time import strftime
from datetime import date

#language information
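# Each entry maps a language code to its English name, its local autonym and,
# optionally, a 'weight' used to scale raw character counts so that scripts
# with different information density (e.g. Chinese, Japanese, Korean) become
# comparable. Missing weights default to 1.0; 'similar_lang' borrows the
# weight of a related language (see GetWeightForLang).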
lang_info ={'en': {'name':u'English',     'localname':u'English',  'weight': 1.000},
            'de': {'name':u'German',      'localname':u'Deutsch',  'weight':0.894},
            'fr': {'name':u'French',      'localname':u'Français', 'weight':0.894},
            'pl': {'name':u'Polish',      'localname':u'Polski',   'weight':0.956},
            'ja': {'name':u'Japanese',    'localname':u'日本語',    'weight':2.551},
            'it': {'name':u'Italian',     'localname':u'Italiano', 'weight':0.891},
            'nl': {'name':u'Dutch',       'localname':u'Nederlands', 'weight':0.833},
            'pt': {'name':u'Portuguese',  'localname':u'Português', 'weight':0.937},
            'es': {'name':u'Spanish',     'localname':u'Español',  'weight':0.897},
            'sv': {'name':u'Swedish',     'localname':u'Svenska',  'weight':1.004},
            'ru': {'name':u'Russian',     'localname':u'Русский',  'weight':0.908},
            'zh': {'name':u'Chinese',     'localname':u'中文',      'weight':3.786},
            'no': {'name':u'Norwegian (Bokmål)','localname':u'Norsk (Bokmål)', 'weight':1.042},
            'fi': {'name':u'Finnish',     'localname':u'Suomi', 'weight':0.958},
            'vo': {'name':u'Volapük',     'localname':u'Volapük'},
            'ca': {'name':u'Catalan',     'localname':u'Català', 'weight':0.971},
            'ro': {'name':u'Romanian',    'localname':u'Română', 'weight':0.894},
            'tr': {'name':u'Turkish',     'localname':u'Türkçe', 'weight':1.034},
            'uk': {'name':u'Ukrainian',   'localname':u'Українська', 'weight':0.994},
            'eo': {'name':u'Esperanto',   'localname':u'Esperanto', 'weight':1.074},
            'cs': {'name':u'Czech',       'localname':u'Čeština', 'weight':1.083},
            'hu': {'name':u'Hungarian',   'localname':u'Magyar', 'weight':0.884},
            'sk': {'name':u'Slovak',      'localname':u'Slovenčina', 'weight':1.054},
            'da': {'name':u'Danish',      'localname':u'Dansk', 'weight':0.978},
            'id': {'name':u'Indonesian',  'localname':u'Bahasa Indonesia', 'weight':0.851},
            'he': {'name':u'Hebrew',      'localname':u'עברית', 'weight':1.466},
            'lt': {'name':u'Lithuanian',  'localname':u'Lietuvių', 'weight':0.977},
            'sr': {'name':u'Serbian',     'localname':u'Српски / Srpski', 'weight':1.121},
            'sl': {'name':u'Slovenian',   'localname':u'Slovenščina', 'weight':1.026},
            'ko': {'name':u'Korean',      'localname':u'한국어', 'weight':2.252},
            'ar': {'name':u'Arabic',      'localname':u'العربية', 'weight':1.408},
            'bg': {'name':u'Bulgarian',   'localname':u'Български', 'weight':0.935},
            'et': {'name':u'Estonian',    'localname':u'Eesti', 'weight':0.986},
            'hr': {'name':u'Croatian',    'localname':u'Hrvatski', 'weight':1.078},
            'new':{'name':u'Newar / Nepal Bhasa','localname':u'नेपाल भाषा'},
            'te': {'name':u'Telugu',      'localname':u'తెలుగు'},
            'vi': {'name':u'Vietnamese',  'localname':u'Tiếng Việt', 'weight':0.827},
            'th': {'name':u'Thai',        'localname':u'ไทย', 'weight':1.143},
            'gl': {'name':u'Galician',    'localname':u'Galego', 'weight':0.947},
            'fa': {'name':u'Persian',     'localname':u'فارسی', 'weight':1.167},
            'nn': {'name':u'Norwegian (Nynorsk)','localname':u'Nynorsk', 'similar_lang':'no'},
            'ceb':{'name':u'Cebuano',     'localname':u'Sinugboanong Binisaya', 'weight':0.873},
            'el': {'name':u'Greek',       'localname':u'Ελληνικά', 'weight':0.857},
            'ms': {'name':u'Malay',       'localname':u'Bahasa Melayu', 'weight':0.845},
            'simple':{'name':u'Simple English','localname':u'Simple English'},
            'eu': {'name':u'Basque',      'localname':u'Euskara', 'weight':0.967},
            'bpy':{'name':u'Bishnupriya Manipuri','localname':u'ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী'},
            'bs': {'name':u'Bosnian',     'localname':u'Bosanski', 'similar_lang':'hr'},
            'lb': {'name':u'Luxembourgish','localname':u'Lëtzebuergesch'},
            'is': {'name':u'Icelandic',   'localname':u'Íslenska', 'weight':1.041},
            'ka': {'name':u'Georgian',    'localname':u'ქართული'},
            'sq': {'name':u'Albanian',    'localname':u'Shqip'},
            'la': {'name':u'Latin',       'localname':u'Latina', 'weight':1.070},
            'br': {'name':u'Breton',      'localname':u'Brezhoneg'},
            'az': {'name':u'Azeri',       'localname':u'Azərbaycan', 'weight':1.2},
            'hi': {'name':u'Hindi',       'localname':u'हिन्दी', 'weight':0.978},
            'bn': {'name':u'Bengali',     'localname':u'বাংলা'},
            'ht': {'name':u'Haitian',     'localname':u'Krèyol ayisyen'},
            'mk': {'name':u'Macedonian',  'localname':u'Македонски', 'weight':0.995},
            'mr': {'name':u'Marathi',     'localname':u'मराठी'},
            'sh': {'name':u'Serbo-Croatian','localname':u'Srpskohrvatski / Српскохрватски', 'similar_lang':'hr'},
            'tl': {'name':u'Tagalog',     'localname':u'Tagalog'},
            'io': {'name':u'Ido',         'localname':u'Ido'},
            'cy': {'name':u'Welsh',       'localname':u'Cymraeg', 'weight':1.050},
            'pms':{'name':u'Piedmontese', 'localname':u'Piemontèis'},
            'lv': {'name':u'Latvian',     'localname':u'Latviešu', 'weight':1.017},
            'su': {'name':u'Sundanese',   'localname':u'Basa Sunda'},
            'ta': {'name':u'Tamil',       'localname':u'தமிழ்', 'weight':0.800},
            'jv': {'name':u'Javanese',    'localname':u'Basa Jawa'},
            'nap':{'name':u'Neapolitan',  'localname':u'Nnapulitano'},
            'oc': {'name':u'Occitan',     'localname':u'Occitan'},
            'nds':{'name':u'Low Saxon',   'localname':u'Plattdüütsch'},
            'scn':{'name':u'Sicilian',    'localname':u'Sicilianu'},
            'ast':{'name':u'Asturian',    'localname':u'Asturianu'},
            'ku': {'name':u'Kurdish',     'localname':u'Kurdî / كوردی'},
            'be': {'name':u'Belarusian',  'localname':u'Беларуская', 'weight':0.937},
            'wa': {'name':u'Walloon',     'localname':u'Walon'},
            'af': {'name':u'Afrikaans',   'localname':u'Afrikaans', 'weight':1.025},
            'be-x-old':{'name':u'Belarusian (Taraškievica)','localname':u'Беларуская (тарашкевіца)', 'weight':1.4},
            'tg': {'name':u'Tajik',       'localname':u'Тоҷикӣ'},
            'an': {'name':u'Aragonese',   'localname':u'Aragonés', 'weight':1.1},
            'fy': {'name':u'West Frisian','localname':u'Frysk'},
            'vec':{'name':u'Venetian',    'localname':u'Vèneto'},
            'roa-tara':{'name':u'Tarantino',   'localname':u'Tarandíne'},
            'cv': {'name':u'Chuvash',     'localname':u'Чăваш'},
            'zh-yue':{'name':u'Cantonese',   'localname':u'粵語', 'similar_lang':'zh'},
            'ur': {'name':u'Urdu',        'localname':u'اردو'},
            'ksh':{'name':u'Ripuarian',   'localname':u'Ripoarisch'},
            'sw': {'name':u'Swahili',     'localname':u'Kiswahili'},
            'qu': {'name':u'Quechua',     'localname':u'Runa Simi'},
            'uz': {'name':u'Uzbek',       'localname':u'O‘zbek'},
            'mi': {'name':u'Maori',       'localname':u'Māori'},
            'ga': {'name':u'Irish',       'localname':u'Gaeilge'},
            'bat-smg':{'name':u'Samogitian',  'localname':u'Žemaitėška'},
            'ml': {'name':u'Malayalam',   'localname':u'മലയാളം', 'weight':1.004},
            'gd': {'name':u'Scottish Gaelic','localname':u'Gàidhlig'},
            'yo': {'name':u'Yoruba',      'localname':u'Yorùbá'},
            'co': {'name':u'Corsican',    'localname':u'Corsu'},
            'kn': {'name':u'Kannada',     'localname':u'ಕನ್ನಡ', 'weight':0.999},
            'pam':{'name':u'Kapampangan', 'localname':u'Kapampangan'},
            'yi': {'name':u'Yiddish',     'localname':u'ייִדיש'},
            'hsb':{'name':u'Upper Sorbian','localname':u'Hornjoserbsce'},
            'nah':{'name':u'Nahuatl',     'localname':u'Nāhuatl'},
            'ia': {'name':u'Interlingua', 'localname':u'Interlingua', 'weight':1.0},
            'li': {'name':u'Limburgian',  'localname':u'Limburgs'},
            'sa': {'name':u'Sanskrit',    'localname':u'संस्कृतम्'},
            'hy': {'name':u'Armenian',    'localname':u'Հայերեն', 'weight':0.904},
            'tt': {'name':u'Tatar',       'localname':u'Tatarça / Татарча'},
            'als':{'name':u'Alemannic',   'localname':u'Alemannisch', 'weight':1.1},
            'roa-rup':{'name':u'Aromanian',   'localname':u'Armãneashce'},
            'lmo':{'name':u'Lombard',     'localname':u'Lumbaart'},
            'map-bms':{'name':u'Banyumasan',  'localname':u'Basa Banyumasan'},
            'am': {'name':u'Amharic',     'localname':u'አማርኛ'},
            'nrm':{'name':u'Norman',      'localname':u'Nouormand/Normaund'},
            'zh-min-nan':{'name':u'Min Nan',     'localname':u'Bân-lâm-gú', 'weight':1.2},
            'pag':{'name':u'Pangasinan',  'localname':u'Pangasinan'},
            'wuu':{'name':u'Wu',          'localname':u'吴语', 'similar_lang':'zh'},
            'fo': {'name':u'Faroese',     'localname':u'Føroyskt'},
            'vls':{'name':u'West Flemish','localname':u'West-Vlams'},
            'nds-nl':{'name':u'Dutch Low Saxon','localname':u'Nedersaksisch'},
            'se': {'name':u'Northern Sami','localname':u'Sámegiella'},
            'rm': {'name':u'Romansh',     'localname':u'Rumantsch'},
            'ne': {'name':u'Nepali',      'localname':u'नेपाली'},
            'war':{'name':u'Waray-Waray', 'localname':u'Winaray'},
            'fur':{'name':u'Friulian',    'localname':u'Furlan'},
            'lij':{'name':u'Ligurian',    'localname':u'Líguru'},
            'nov':{'name':u'Novial',      'localname':u'Novial'},
            'bh': {'name':u'Bihari',      'localname':u'भोजपुरी'},
            'sco':{'name':u'Scots',       'localname':u'Scots'},
            'dv': {'name':u'Divehi',      'localname':u'ދިވެހިބަސް'},
            'pi': {'name':u'Pali',        'localname':u'पाऴि'},
            'diq':{'name':u'Zazaki',      'localname':u'Zazaki'},
            'ilo':{'name':u'Ilokano',     'localname':u'Ilokano'},
            'kk': {'name':u'Kazakh',      'localname':u'Қазақша', 'weight':1.3},
            'os': {'name':u'Ossetian',    'localname':u'Иронау'},
            'frp':{'name':u'Franco-Provençal/Arpitan','localname':u'Arpitan'},
            'zh-classical':{'name':u'Classical Chinese','localname':u'古文 / 文言文', 'similar_lang':'zh'},
            'mt': {'name':u'Maltese',     'localname':u'Malti'},
            'lad':{'name':u'Ladino',      'localname':u'Dzhudezmo'},
            'fiu-vro':{'name':u'Võro',        'localname':u'Võro'},
            'pdc':{'name':u'Pennsylvania German','localname':u'Deitsch'},
            'csb':{'name':u'Kashubian',   'localname':u'Kaszëbsczi'},
            'kw': {'name':u'Cornish',     'localname':u'Kernewek'},
            'bar':{'name':u'Bavarian',    'localname':u'Boarisch'},
            'to': {'name':u'Tongan',      'localname':u'faka Tonga'},
            'haw':{'name':u'Hawaiian',    'localname':u'Hawai`i'},
            'mn': {'name':u'Mongolian',   'localname':u'Монгол'},
            'ps': {'name':u'Pashto',      'localname':u'پښتو'},
            'ang':{'name':u'Anglo-Saxon', 'localname':u'Englisc'},
            'km': {'name':u'Khmer',       'localname':u'ភាសាខ្មែរ'},
            'gv': {'name':u'Manx',        'localname':u'Gaelg'},
            'tk': {'name':u'Turkmen',     'localname':u'تركمن / Туркмен'},
            'ln': {'name':u'Lingala',     'localname':u'Lingala'},
            'ie': {'name':u'Interlingue', 'localname':u'Interlingue'},
            'tpi':{'name':u'Tok Pisin',   'localname':u'Tok Pisin'},
            'crh':{'name':u'Crimean Tatar','localname':u'Qırımtatarca'},
            'jbo':{'name':u'Lojban',      'localname':u'Lojban', 'weight':1.2},
            'wo': {'name':u'Wolof',       'localname':u'Wolof'},
            'ay': {'name':u'Aymara',      'localname':u'Aymar'},
            'zea':{'name':u'Zealandic',   'localname':u'Zeêuws'},
            'eml':{'name':u'Emilian-Romagnol','localname':u'Emiliàn e rumagnòl'},
            'si': {'name':u'Sinhalese',   'localname':u'සිංහල'},
            'sc': {'name':u'Sardinian',   'localname':u'Sardu'},
            'or': {'name':u'Oriya',       'localname':u'ଓଡ଼ିଆ'},
            'ig': {'name':u'Igbo',        'localname':u'Igbo'},
            'mg': {'name':u'Malagasy',    'localname':u'Malagasy'},
            'cbk-zam':{'name':u'Zamboanga Chavacano','localname':u'Chavacano de Zamboanga'},
            'gu': {'name':u'Gujarati',    'localname':u'ગુજરાતી'},
            'ky': {'name':u'Kirghiz',     'localname':u'Кыргызча'},
            'kg': {'name':u'Kongo',       'localname':u'KiKongo'},
            'ty': {'name':u'Tahitian',    'localname':u'Reo Mā`ohi'},
            'glk':{'name':u'Gilaki',      'localname':u'گیلکی'},
            'arc':{'name':u'Assyrian Neo-Aramaic','localname':u'ܐܪܡܝܐ'},
            'gn': {'name':u'Guarani',     'localname':u'Avañe\'ẽ'},
            'kab':{'name':u'Kabyle',      'localname':u'Taqbaylit'},
            'so': {'name':u'Somali',      'localname':u'Soomaaliga'},
            'ks': {'name':u'Kashmiri',    'localname':u'कश्मीरी / كشميري'},
            'stq':{'name':u'Saterland Frisian','localname':u'Seeltersk'},
            'mzn':{'name':u'Mazandarani', 'localname':u'مَزِروني'},
            'cu': {'name':u'Old Church Slavonic','localname':u'Словѣньскъ'},
            'ce': {'name':u'Chechen',     'localname':u'Нохчийн'},
            'udm':{'name':u'Udmurt',      'localname':u'Удмурт кыл'},
            'tet':{'name':u'Tetum',       'localname':u'Tetun'},
            'sd': {'name':u'Sindhi',      'localname':u'سنڌي، سندھی ، सिन्ध'},
            'pap':{'name':u'Papiamentu',  'localname':u'Papiamentu'},
            'ba': {'name':u'Bashkir',     'localname':u'Башҡорт', 'similar_lang':'kk'},
            'pa': {'name':u'Punjabi',     'localname':u'ਪੰਜਾਬੀ'},
            'rmy':{'name':u'Romani',      'localname':u'romani - रोमानी'},
            'lo': {'name':u'Lao',         'localname':u'ລາວ'},
            'na': {'name':u'Nauruan',     'localname':u'dorerin Naoero'},
            'bcl':{'name':u'Central Bicolano','localname':u'Bikol'},
            'kaa':{'name':u'Karakalpak',  'localname':u'Qaraqalpaq tili'},
            'gan':{'name':u'Gan',         'localname':u'贛語', 'similar_lang':'zh'},
            'iu': {'name':u'Inuktitut',   'localname':u'ᐃᓄᒃᑎᑐᑦ'},
            'myv':{'name':u'Erzya',       'localname':u'Эрзянь (Erzjanj Kelj)'},
            'szl':{'name':u'Silesian',    'localname':u'Ślůnski'},
            'sah':{'name':u'Sakha',       'localname':u'Саха тыла (Saxa Tyla)'},
            'my': {'name':u'Burmese',     'localname':u'မြန်မာဘာသာ'},
            'ext':{'name':u'Extremaduran','localname':u'Estremeñu'},
            'hif':{'name':u'Fiji Hindi',  'localname':u'Fiji Hindi'},
            'bo': {'name':u'Tibetan',     'localname':u'བོད་སྐད་'},
            'srn':{'name':u'Sranan',      'localname':u'Sranantongo'},
            'got':{'name':u'Gothic',      'localname':u'𐌲𐌿𐍄𐌹𐍃𐌺'},
            'dsb':{'name':u'Lower Sorbian','localname':u'Dolnoserbšćina'},
            'bm': {'name':u'Bambara',     'localname':u'Bamanankan'},
            'sm': {'name':u'Samoan',      'localname':u'Gagana Samoa'},
            'cdo':{'name':u'Min Dong',    'localname':u'Mìng-dĕ̤ng-ngṳ̄'},
            'chr':{'name':u'Cherokee',    'localname':u'ᏣᎳᎩ ᎧᏬᏂᎯᏍᏗ'},
            'mdf':{'name':u'Moksha',      'localname':u'Мокшень (Mokshanj Kälj)'},
            'om': {'name':u'Oromo',       'localname':u'Oromoo'},
            'ee': {'name':u'Ewe',         'localname':u'Eʋegbe'},
            'as': {'name':u'Assamese',    'localname':u'অসমীয়া ভাষা আৰু লিপি'},
            'ti': {'name':u'Tigrinya',    'localname':u'ትግርኛ_ፊደል'},
            'ug': {'name':u'Uyghur',      'localname':u'Oyghurque'},
            'kv': {'name':u'Komi',        'localname':u'Коми'},
            'zu': {'name':u'Zulu',        'localname':u'IsiZulu'},
            'av': {'name':u'Avar',        'localname':u'Авар'},
            'nv': {'name':u'Navajo',      'localname':u'Diné bizaad'},
            'ss': {'name':u'Swati',       'localname':u'SiSwati'},
            'pih':{'name':u'Norfolk',     'localname':u'Norfuk'},
            'ts': {'name':u'Tsonga',      'localname':u'Xitsonga'},
            'cr': {'name':u'Cree',        'localname':u'Nehiyaw'},
            've': {'name':u'Venda',       'localname':u'TshiVenda'},
            'ch': {'name':u'Chamorro',    'localname':u'Chamoru'},
            'bi': {'name':u'Bislama',     'localname':u'Bislama'},
            'xh': {'name':u'Xhosa',       'localname':u'IsiXhosa'},
            'rw': {'name':u'Kinyarwanda', 'localname':u'Kinyarwanda'},
            'dz': {'name':u'Dzongkha',    'localname':u'རྫོང་ཁ་'},
            'tn': {'name':u'Tswana',      'localname':u'Setswana'},
            'kl': {'name':u'Greenlandic', 'localname':u'Kalaallisut'},
            'bug':{'name':u'Buginese',    'localname':u'Basa Ugi'},
            'ik': {'name':u'Inupiak',     'localname':u'Iñupiak uqautchit'},
            'bxr':{'name':u'Buryat (Russia)','localname':u'Буряад'},
            'st': {'name':u'Sesotho',     'localname':u'Sesotho'},
            'xal':{'name':u'Kalmyk',      'localname':u'Хальмг келн'},
            'ny': {'name':u'Chichewa',    'localname':u'Chicheŵa'},
            'ak': {'name':u'Akan',        'localname':u'Akana'},
            'ab': {'name':u'Abkhazian',   'localname':u'Аҧсуа бызшәа'},
            'fj': {'name':u'Fijian',      'localname':u'Na Vosa Vakaviti'},
            'lg': {'name':u'Luganda',     'localname':u'Luganda'},
            'tw': {'name':u'Twi',         'localname':u'Twi'},
            'ha': {'name':u'Hausa',       'localname':u'هَوُسَ'},
            'za': {'name':u'Zhuang',      'localname':u'Sawcuengh'},
            'ff': {'name':u'Fula',        'localname':u'Fulfulde'},
            'lbe':{'name':u'Lak',         'localname':u'Лакку маз'},
            'ki': {'name':u'Kikuyu',      'localname':u'Gĩgĩkũyũ'},
            'sn': {'name':u'Shona',       'localname':u'ChiShona'},
            'tum':{'name':u'Tumbuka',     'localname':u'ChiTumbuka'},
            'sg': {'name':u'Sango',       'localname':u'Sängö'},
            'chy':{'name':u'Cheyenne',    'localname':u'Tsetsêhestâhese'},
            'rn': {'name':u'Kirundi',     'localname':u'Kirundi'},
            'arz':{'name':u'Egyptian Arabic', 'localname':u'مصرى (Maṣrī)', 'similar_lang':'ar'},
            'pnt':{'name':u'Pontic',          'localname':u'Ποντιακά', 'similar_lang':'el'},
            'mhr':{'name':u'Meadow Mari',     'localname':u'Олык Марий'},
            'ace':{'name':u'Acehnese',        'localname':u'Acèh'},
            'ckb':{'name':u'Soranî',          'localname':u'Soranî / کوردی'},
            'mwl':{'name':u'Mirandese',       'localname':u'Mirandés'},
            'pnb':{'name':u'Western Panjabi', 'localname':u'پنجابی'},
            'pcd':{'name':u'Picard',          'localname':u'Picard'},
            'krc':{'name':u'Karachay-Balkar', 'localname':u'Къарачай-Малкъар'},
            'frr':{'name':u'North Frisian',   'localname':u'Nordfriisk'},
            'bjn':{'name':u'Banjar',          'localname':u'Bahasa Banjar'},
            'mrj':{'name':u'Hill Mari',       'localname':u'Кырык Мары (Kyryk Mary)'},
            'koi':{'name':u'Komi-Permyak',    'localname':u'Перем Коми (Perem Komi)'},
            'gag':{'name':u'Gagauz',           'localname':u'Gagauz'},
            'pfl':{'name':u'Palatinate German','localname':u'Pfälzisch'},
            'rue':{'name':u'Rusyn',            'localname':u'русиньскый язык'},
            'ltg':{'name':u'Latgalian',        'localname':u'Latgaļu volūda'},
            'kbd':{'name':u'Kabardian',        'localname':u'Aдыгэбзэ'},
            'xmf':{'name':u'Mingrelian',       'localname':u'მარგალური'},
            'nso':{'name':u'Northern Sotho',   'localname':u'Sesotho sa Leboa'},
            'vep':{'name':u'Veps',             'localname':u'Vepsän kel\''},
            'lez':{'name':u'Lezgi',            'localname':u'Лезги'},
            'min':{'name':u'Minangkabau',      'localname':u'Minangkabau'},
            'tyv':{'name':u'Tuva',             'localname':u'Тыва дыл'},
            'hak':{'name':u'Hakka',            'localname':u'Hak-kâ-fa / 客家話'},
            'mai':{'name':u'Maithili',         'localname':u'मैथिली'},
            'lrc':{'name':u'Northern Luri',    'localname':u'لۊری شومالی'},
            'gom':{'name':u'Konkani',          'localname':u'कोंकणी / Konknni'},
            'ady':{'name':u'Western Adyghe',   'localname':u'адыгабзэ'},
            'azb':{'name':u'South Azerbaijani','localname':u'تۆرکجه'},
            'jam':{'name':u'Patois',           'localname':u'Jamaican Creole English'},
            'olo':{'name':u'Livvi-Karelian',   'localname':u'Livvinkarjala'},
            'tcy':{'name':u'Tulu',             'localname':u'ತುಳು'},
            'kbp':{'name':u'Kabiye',           'localname':u'Kabɩyɛ'},
            'atj':{'name':u'Atikamekw',        'localname':u'Atikamekw'},
            'dty':{'name':u'Doteli',           'localname':u'डोटेली'},
            'inh':{'name':u'Ingush',           'localname':u'Гӏалгӏай'},
            'gor':{'name':u'Gorontalo',        'localname':u'Hulontalo'},
            'lfn':{'name':u'Lingua Franca Nova', 'localname':u'Lingua Franca Nova'},
            'sat':{'name':u'Santali',          'localname':u'ᱥᱟᱱᱛᱟᱲᱤ'}}


#languages to process
#lang_keys = ['sv', 'ca', 'ru', 'en', 'de', 'fr', 'es', 'eo', 'gl']
#lang_keys = ['en', 'ru', 'mai']
lang_keys = lang_info.keys()
lang_keys.sort()

#optimize by caching stuff
iw_cache = {}
en_labels = {}
item_list = []
disambigs = []

#debug
max_words = -1 

prev_score = {}

#score colors
color10000 = 'BF5FFF'
color4000 = 'FF7F00'
color2000 = 'FFBE00'
color1000 = 'FFFF00'
color500  = 'BEFF00' 
color250  = '40FF00'
color100  = '00FF7D'
color0    = 'EFEFEF'

item_list_path = "ExpandedItemList.txt"


def ListOfArticlesExpanded():
    article_prefix = 'List of articles every Wikipedia should have/Expanded/'
    article_suffixes = ['People','History','Geography','Arts','Philosophy and religion','Anthropology, psychology and everyday life','Society and social sciences','Biology and health sciences','Physical sciences','Technology','Mathematics']
    #article_suffixes = ['Measurement']
    pages = []
    meta_wiki = pywikibot.Site('meta', 'meta')
    for suffix in article_suffixes:
        article_name = article_prefix + suffix
        meta_page = pywikibot.Page(meta_wiki, article_name)
        pages.append(meta_page)
    return pages

def LoadItemList():
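    # Build ExpandedItemList.txt from the expanded list pages on Meta: every
    # '[[d:Qxxx|...]]' link is a Wikidata item id, written out one per line.
    # If the file already exists it is reused and nothing is fetched.
    # Section headings ('== ... ==') are echoed to the console as progress.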
    item_path = item_list_path
    if os.path.isfile(item_path):
        return
    f = open(item_path, 'w')
    count = 0
    grand_total = 0
    pages = ListOfArticlesExpanded()
    for meta_page in pages:
        article   = meta_page.get(get_redirect=False)
        name_last  = 0
        name_first = article.find(u'[[d:', name_last)
        while name_first > -1:
            name_mid  = article.find(u'|',  name_first)

            cat_start = article.rfind(u'\n== ', name_last, name_first)
            if cat_start > -1:
                cat_end   = article.find(u'==',cat_start+3, name_first)
                if cat_end > -1: 
                    cat   = article[cat_start+3:cat_end]
                    catName = ''.center(len(cat),'-')
                    pywikibot.output(u'\n%s' % cat)
                    pywikibot.output(u'\n%s' % catName)
                    count = 0
         
            name_last = article.find(u']]', name_first)
            if name_last > name_mid:
                name_last = name_mid
            article_item = article[name_first+4:name_last]
            f.write(article_item.encode("utf_8"))
            f.write('\n')
            count += 1
            grand_total += 1
            pywikibot.output(u'%d %s' % (count,article_item))
            name_first = article.find(u'[[d:', name_last)
         
    f.close()
 
    pywikibot.output(u'\nGRAND TOTAL\n-----------\n%d articles' % (grand_total))

def GetItemList():
    LoadItemList()
    count = 0
    item_file = open(item_list_path, 'r')
    for line in item_file:
        item = line[:-1].decode('utf_8')
        if item in item_list:
            errortext = item + " twice in list\n"
            pywikibot.output(errortext)
            with open("errorlog.txt", "a") as errorlog:
                errorlog.write(errortext)
        else:
            item_list.append(item)
    item_file.close()

def GetManyIws(itemlist):
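    # Fetch up to 50 Wikidata items in one API query. For each item, cache its
    # sitelinks in iw_cache, its English label in en_labels, and record it in
    # disambigs if it is a disambiguation page (P31 = Q4167410). Redirected
    # item ids are swapped for their targets; items without a Wikidata page or
    # without sitelinks are logged to errorlog.txt and dropped from item_list.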
    pipedword = '|'.join(itemlist)
    wiki = pywikibot.Site('wikidata', 'wikidata')
    #params = {
    #    'action'    :'query',
    #    'prop'      :'revisions',
    #    'redirects' :True,
    #    'titles'    :pipedword,
    #    'rvprop'    :'content'
    #    }
    #queryresult = query.GetData(params, site=wiki)
    pageRequest = api.Request(action="query", prop="revisions", redirects=True, titles=pipedword, rvprop="content", rvslots="main", site=wiki)
    queryresult = pageRequest.submit()
    #pywikibot.output(queryresult)
    pages = queryresult[u'query'][u'pages']
    word_text = {}
    newitemlist = set()
    for k, v in pages.iteritems():
        #print 'k ', k
        #print 'v ', v
        item = v[u'title']
        newitemlist.add(item)
        if item not in itemlist:
            print 'not in ', item
            item_list.append(item)
            errortext = item + " is redirected to.\n"
            with open("errorlog.txt", "a") as errorlog:
                errorlog.write(errortext)
        try:
            pagetext=v[u'revisions'][0][u'slots'][u'main'][u'*']
            #print pagetext;
        except:
            errortext = item + " has no wikidata item\n"
            if item in item_list:
                item_list.remove(item)
            pywikibot.output(errortext)
            with open("errorlog.txt", "a") as errorlog:
                errorlog.write(errortext)
            continue  # no revision text to parse; skip this item
        data_dict = json.loads(pagetext)
        #print "dd ", data_dict
        try:
            iw_link_info = data_dict[u'sitelinks']
        except:
            iw_link_info = data_dict[u'links']
        iw_links = {}
        #print "ili ", iw_link_info
        print item
        try:
            for linkkey, linkvalue in iw_link_info.iteritems():
                #print "lk: ", linkkey, " lv: ", linkvalue
                iw_links[linkkey] = linkvalue[u'title']
        except:
            errortext = item + " has no links\n"
            if item in item_list:
                item_list.remove(item)
            pywikibot.output(errortext)
            with open("errorlog.txt", "a") as errorlog:
                errorlog.write(errortext)
        try:
            labels = data_dict[u'labels']
            if u'en' in labels:
                en_labels[item] = labels[u'en'][u'value']
            else:
                en_labels[item] = item
        except:
            labels = data_dict[u'label']
            if u'en' in labels:
                en_labels[item] = labels[u'en']
            else:
                en_labels[item] = item
        iw_cache[item] = iw_links
        try:
            word_text[v[u'title']]=en_labels[item]
        except:
            word_text[v[u'title']] = item
        if u'P31' in data_dict[u'claims']:
            for claim in data_dict[u'claims'][u'P31']:
                if claim[u'mainsnak'][u'datavalue'][u'value'][u'numeric-id'] == 4167410:
                    disambigs.append(item)
    pywikibot.output(str(word_text.values()))
    redir_items = [x for x in itemlist if x not in newitemlist]
    #print 'redirected items:', redir_items
    for redir_item in redir_items:
        item_list.remove(redir_item)
        errortext = redir_item + " is redirected from.\n"
        with open("errorlog.txt", "a") as errorlog:
            errorlog.write(errortext)
    #print 'il', item_list
    return word_text

def GetIwLinks():  
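    # Load the sitelink and label caches from IwLinks.json and Labels.json if
    # present; otherwise fetch everything from Wikidata in batches of 50 items
    # (see GetManyIws) and write both caches to disk.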
    iw_link_path = "IwLinks.json"
    en_label_path = "Labels.json"
    global iw_cache
    global en_labels
    if os.path.isfile(iw_link_path):
        iwf = open(iw_link_path, 'r')
        iw_cache = json.load(iwf)
        iwf.close()
        enf = open(en_label_path, 'r')
        en_labels = json.load(enf)
        enf.close()
        return
    textdict = {}
    article_group = []
    item_file = open(item_list_path)
    for line in item_file:
            item = line[:-1].decode('utf_8')
            article_group.append(item)
            if len(article_group) == 50:
                textdict.update(GetManyIws(article_group))
                article_group = []
    if len(article_group) > 0:
        textdict.update(GetManyIws(article_group))
        article_group = []
    item_file.close()
    iwf = open('IwLinks.json', 'w')
    json.dump(iw_cache, iwf)
    iwf.close()
    enf = open('Labels.json', 'w')
    json.dump(en_labels, enf)
    enf.close()
    return "klart"
 
#format an integer with comma thousands separators
def FormatNumber(s):
    r = []
    for i, c in enumerate(reversed(str(int(s)))):
        if i and i % 3 == 0:
            r.insert(0, ',')
        r.insert(0, c)
    return ''.join(r)


def GetPreviousScores():

    temp_path = "PreviousScores.txt"  
    if os.path.isfile(temp_path):
       temp_file = open(temp_path)
       for line in temp_file:
            tokens = line.split()
            prev_score[tokens[0]] = float(tokens[1])
       temp_file.close()
    

def GetArticle(item, wiki, lang):
    word = GetArticleInterwikiName(item, lang)
        
    if len(word) > 0:
        page = pywikibot.Page(wiki, word)
        article = page.get(get_redirect=True)

        #if u'#REDIRECT' in article.upper():
        #    text_start = article.find('[[')
        #    text_end = article.find(']]', text_start)
        #    word = article[text_start+2:text_end]
        #    page = pywikibot.Page(wiki, word)
        #    article = page.get()
    else:
        article = ''

    return article


def GetArticleInterwikiName(item, lang):

    if item in iw_cache:
        iw_links = iw_cache[item]
    else:
        wikidata = pywikibot.Site('wikidata', 'wikidata')
        try:
            datapage = pywikibot.DataPage(wikidata, item)
            data_dict = datapage.get()
        except:
             print('Where is ' + item)
             return ''
            
        iw_links = data_dict[u'links']
        labels = data_dict[u'label']
        iw_cache[item] = iw_links
        if u'en' in labels:
            en_labels[item] = labels[u'en']
        else:
            en_labels[item] = u''
    lang_wiki = lang.replace("-","_") + u'wiki'

    if lang_wiki in iw_links:
        try:
            local_name = iw_links[lang_wiki][u'name']
        except:
            local_name = iw_links[lang_wiki]
        return local_name
    else:
        return ''


def GetInterwikiLength(article):
    
    #calculate len of all interwiki links
    interwiki_len   = 0
    interwiki_last  = 0
    interwiki_colon = 0
    interwiki_nl    = 0
    interwiki_first = article.find(u'[[', interwiki_last)
    while interwiki_first > -1:    
        interwiki_last  = article.find(u']]', interwiki_first)
        interwiki_colon = article.find(u':',  interwiki_first)
        if interwiki_colon > -1 and interwiki_colon < interwiki_last:
           curlang = article[interwiki_first+2:interwiki_colon]
           if curlang in lang_info:
               interwiki_nl = article.find(u'\n', interwiki_last)
               if interwiki_nl > -1:
                  interwiki_len += (interwiki_nl - interwiki_first) + 1
               else:
                  interwiki_len += (interwiki_last - interwiki_first) + 2
        interwiki_first = article.find(u'[[', interwiki_last)

    return interwiki_len


def GetCommentLength(article):

    #calculate len of all comments
    comment_len   = 0
    comment_last  = 0
    comment_first = article.find(u'<!--', comment_last)
    while comment_first > -1:    
        comment_last = article.find(u'-->', comment_first)
        if comment_last == -1:
           comment_last = comment_first + 4
            
        comment_len += (comment_last - comment_first) - 4
        comment_first = article.find(u'<!--', comment_last)

    return comment_len

def IsArticleEnglish(article):
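    # Heuristic check for untranslated English: strip comments and <ref> tags,
    # then measure how much of the text consists of common English function
    # words. English prose is usually ~30% such words, other languages only a
    # few percent, so a share above 20% (with at least 20 hits) flags the article.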

    #remove comments
    comments = re.compile(r'<!--(.|\n|\r)*?-->')
    article = comments.sub("", article)

    #remove references
    refs = re.compile(r'<ref(.|\n|\r)*?</ref>')
    article = refs.sub("", article)

    # convert article to lower case word list
    word_list = article.lower().split()

    if len(word_list) == 0:
        return False

    # create dictionary of word:frequency pairs
    freq_dic = {}

    # punctuation marks to be removed
    punctuation = re.compile(r'[.?!,":;]') 
    for word in word_list:
        word = punctuation.sub("", word)
        if word in freq_dic: 
            freq_dic[word] += 1
        else: 
            freq_dic[word] = 1

    # usually English is ~30% these words and non-English at most a few percent
    common_english_words = ['the','of','on','a','is','in','his','have','by','but','that','to','with','for',
                            'an','from','are','was','he','which','be','as','it','this','first', 'new', 'and',
                            'she','also','after','at','become','best','had','great', 'into','their',
                            'these','they','time','who','her','not','one','or', 'made', 'would','between']
    en_word_count = 0
    for word in common_english_words:
        if word in freq_dic:
            en_word_count += freq_dic[word]

    percent_thats_common_english = 100.0 * en_word_count / len(word_list)

    # flag if more than 20% of the words are common English words (and at least 20 hits): that means most of the article is in English
    if percent_thats_common_english > 20 and  en_word_count > 20:
        print "Percent %f, %d out of %d" % (percent_thats_common_english, en_word_count, len(word_list))
        return True
    return False



def GetArticleType(wt_article_size):
   if wt_article_size < 0:
      pywikibot.output('negative size!')
      return 'stubs'
   if wt_article_size == 0:
      return 'absent'
   elif 0 < wt_article_size < 8000:
      return 'stubs'
   elif 8000 <= wt_article_size < 16000:
      return 'articles'
   elif wt_article_size >= 16000:
      return 'longarticles'

def GetScoreForLang(lang):
    absent       = lang_info[lang]['absent']
    stubs        = lang_info[lang]['stubs']
    articles     = lang_info[lang]['articles']
    longarticles = lang_info[lang]['longarticles']
    return GetScore(absent, stubs, articles, longarticles)

def GetScore(absent, stubs, articles, longarticles):
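    # Score each article 0 points if absent, 2 if a stub, 3 if a regular
    # article and 4 if a long article, then normalize to a 0-100 scale:
    # score = 100 * rawscore / (4 * total_items).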
    max_count = absent + stubs + articles + longarticles
    max_score = max_count * 4
    raw_score = (stubs*2) + (articles*3) + (longarticles*4)
    if max_score > 0:
        score = 100.0 * raw_score / max_score
    else:
        score = 0
    return score

def GetLink(subtable,lang,value):
    return '[[/'+subtable+'#' + lang +' '+lang_info[lang]['localname']+ '|' + value + ']]'

def GetTableNumber(count, min_subtable_count, max_subtable_count0, subtable, lang, max_subtable_count40=0):
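    # Format a count for the main wikitable, linking it to the per-language
    # section of the given subtable when it lies between min_subtable_count
    # and the applicable maximum (-1 means no upper limit). Wikipedias scoring
    # above 40 use max_subtable_count40 as the maximum instead, when given.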
    value = FormatNumber(count)

    max_subtable_count = max_subtable_count0
    if GetScoreForLang(lang) > 40 and max_subtable_count40 > 0:
        max_subtable_count = max_subtable_count40

    if count >= min_subtable_count and (count <= max_subtable_count or max_subtable_count==-1):
       return GetLink(subtable,lang,value)
    else:
       return value


num_lang = 0

def CalculateStatistics():
    for lang in lang_keys:
        CalculateStatisticsForLang(lang)

def GetWeightForLang(lang):
    lang_weight = 1.0
    if 'weight' in lang_info[lang]:
        lang_weight = lang_info[lang]['weight']
    elif 'similar_lang' in lang_info[lang]:
        lang_weight = lang_info[lang_info[lang]['similar_lang']]['weight']
    return lang_weight

def GetManyArticles(lang, wordlist):
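    # Fetch the wikitext of up to 50 articles in one API query. Titles that
    # could not be resolved are retried: first the failing subset as a smaller
    # batch, then one title at a time; a title is recorded as empty text only
    # if even the single fetch fails.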
    #print wordlist
    pipedword = '|'.join(wordlist.values())
    wiki = pywikibot.Site(lang, 'wikipedia')
    pageRequest = api.Request(action="query", prop="revisions", redirects=True, titles=pipedword, rvprop="content", rvslots="main", site=wiki)
    #params = {
    #    'action'    :'query',
    #    'prop'      :'revisions',
    #    'redirects' :True,
    #    'titles'    :pipedword,
    #    'rvprop'    :'content',
    #    }
    item_text = {}
    second_try = {}
    try:
        queryresult = pageRequest.submit()
        #queryresult = query.GetData(params, site=wiki)
        redirects = {}
        if u'redirects' in queryresult[u'query']:
            for redirpair in queryresult[u'query'][u'redirects']:
                redirects[redirpair[u'from']] = redirpair[u'to']
            pywikibot.output(str(redirects))
        pages = queryresult[u'query'][u'pages']
        word_text = {}
        for k, v in pages.iteritems():
            try:
                word_text[v[u'title']]=v[u'revisions'][0][u'slots'][u'main'][u'*']
            except:
                word_text[v[u'title']]=u''
        for k, v in wordlist.iteritems():
            #if redirects.has_key(v):
            #    word = redirects[v]
            #else: 
            word = v
            try:
                item_text[k] = word_text[word]
            except:
                pywikibot.output(word)
                second_try[k] = word
        pywikibot.output(str(item_text.keys()))
    except:
        second_try = wordlist
    if len(second_try)>0:
        if len(second_try)<len(wordlist):
            item_text.update(GetManyArticles(lang, second_try))
        elif len(second_try)>1:
            for k, v in second_try.iteritems():
                one_item = {}
                one_item[k] = v
                item_text.update(GetManyArticles(lang, one_item))
        else:
            for k, v in second_try.iteritems():
                item_text[k] = u''
                pywikibot.output(u'Error getting: ' + k + u' ' + v)
    return item_text

def GetArticleTexts(lang):  
    textdict = {}
    article_group = {}
#    item_file = open(item_list_path)
#    for index, line in enumerate(item_file):
#        item = line[:-1].decode('utf_8')
    for item in item_list:
        word            = GetArticleInterwikiName(item, lang)
        if (word == ''):
            textdict[item] = ''
        else:
            article_group[item]=word
            if len(article_group) == 50:
#                print article_group
                textdict.update(GetManyArticles(lang, article_group))
                article_group.clear()
                pywikibot.output(lang +u' '+ str(len(textdict)))
    if len(article_group) > 0:
        textdict.update(GetManyArticles(lang, article_group))
        article_group.clear()
#    item_file.close()
    return textdict
    
def CalculateStatisticsForLang(lang):
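    # Compute statistics for one language. Results are cached in
    # ~<lang>_output.txt so interrupted runs can resume. Otherwise every
    # sample article is fetched, its size computed net of interwiki links and
    # HTML comments, checked for untranslated English, then weighted and
    # classified as absent, stub, article or long article.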

    global num_lang
    num_lang += 1

    print ('=['+lang+' '+str(num_lang)+ '/' + str(len(lang_keys)) + ']').ljust(76,'=')

    try:

        lang_info[lang]['total_size']   = 0
        lang_info[lang]['absent']       = 0
        lang_info[lang]['stubs']        = 0
        lang_info[lang]['articles']     = 0
        lang_info[lang]['longarticles'] = 0

        lang_info[lang]['art_count']    = 0

        temp_path = "~%s_output.txt" % (lang)
        if os.path.isfile(temp_path):

            temp_file = open(temp_path)

            art_count = int(temp_file.readline())
            lang_info[lang]['art_count']    = art_count  
            for index in range(art_count):
                artKey = 'art_'+str(index)
                lang_info[lang][artKey] = {}
                lang_info[lang][artKey]['item']  = temp_file.readline().decode('utf_8').strip()
                lang_info[lang][artKey]['name']  = temp_file.readline().decode('utf_8').strip()
                linetext = temp_file.readline()
                try:
                    lang_info[lang][artKey]['size']  = int(linetext)
                except:
                    print index, lang_info[lang][artKey]['item'], lang_info[lang][artKey]['name']
                    lang_info[lang][artKey]['size'] = 0
                lang_info[lang][artKey]['error'] = temp_file.readline().decode('utf_8').strip()
            
            temp_file.close()

            print '..using previous %s result...' % (lang)

        else:        

            wiki = pywikibot.Site(lang, 'wikipedia')

            textdict = GetArticleTexts(lang)
            word_count = 0

            for item, article in textdict.iteritems():

                word_count += 1
                if word_count > max_words > 0:
                    break

                article_size = 0
                error = ''
                
                try:
                    raw_article_size = len(article)

                    interwiki_len = GetInterwikiLength(article)
                    comment_len   = GetCommentLength(article)
                    article_size  = (raw_article_size - interwiki_len - comment_len)

                    if lang != "en" and lang != 'simple' and lang != 'sco' and IsArticleEnglish(article):
                        raise TypeError ("Wrong language, [[%s:%s]] has too much untranslated English." % (lang, GetArticleInterwikiName(item, lang).encode("utf-8")))
                    lang_weight = GetWeightForLang(lang)
                    print str(lang).ljust(3), str(word_count).rjust(3), item.ljust(30),
                    print str(article_size * lang_weight).rjust(11), str(lang_weight).rjust(5), str(interwiki_len).rjust(9), str(comment_len).rjust(9)

                except KeyboardInterrupt:
                    sys.exit(1)

                except Exception:
                    e = sys.exc_info()[1]
                    sys.stderr.write('\n')
                    traceback.print_exc()
                    sys.stderr.write('\n')
                    try:
                        error = CookString(unicode(str(e),'utf-8'))
                    except:
                        error = "Error."

                art_index = item_list.index(item)
                artKey = 'art_'+str(art_index)
                lang_info[lang][artKey] = {}
                lang_info[lang][artKey]['item'] = item
                if item in en_labels:
                    lang_info[lang][artKey]['name'] = en_labels[item]
                else:
                    lang_info[lang][artKey]['name'] = item
                lang_info[lang][artKey]['size'] = article_size
                lang_info[lang][artKey]['error'] = error
                lang_info[lang]['art_count'] = lang_info[lang]['art_count'] + 1  


            temp_file = open(temp_path,'w')
            temp_file.write(str(lang_info[lang]['art_count'])+'\n')
            for index in range(lang_info[lang]['art_count']):
                artKey = 'art_'+str(index)
                temp_file.write(lang_info[lang][artKey]['item'].encode('utf_8')+'\n')
                temp_file.write(lang_info[lang][artKey]['name'].encode('utf_8')+'\n')
                temp_file.write(str(lang_info[lang][artKey]['size'])+'\n')
                temp_file.write(lang_info[lang][artKey]['error'].encode('utf_8')+'\n')
            temp_file.close()

        for index in range(lang_info[lang]['art_count']):
            artKey = 'art_'+str(index)
            article_size    = lang_info[lang][artKey]['size']
            wt_article_size = article_size * GetWeightForLang(lang)
            article_type    = GetArticleType(wt_article_size)
            if not lang_info[lang][artKey]['error']:
                lang_info[lang][article_type] = lang_info[lang][article_type] + 1
                lang_info[lang]['total_size'] = lang_info[lang]['total_size'] + article_size

    except:
        sys.stderr.write('\n')
        traceback.print_exc()
        sys.stderr.write('\n')



def GetGrowthNumber(lang, score):
    if lang in prev_score:
        return score - prev_score[lang]

def GetGrowth(lang, score):
    if lang in prev_score:
       growth    = "%+2.2f" % round(GetGrowthNumber(lang, score),2)
    else:
       growth    = "n/a"
    if growth == '-0.00':
       growth = '+0.00' 
    return growth


def GetAverageSize(lang, article_count):
    if article_count > 0:
       avg_size = int(round(float(lang_info[lang]['total_size']) / article_count))
    else:
       avg_size = 0
    return int(avg_size * GetWeightForLang(lang))



def GetMedianSize(lang):
    x = []
    art_count = lang_info[lang]['art_count']  
    for index in range(art_count):
        artKey = 'art_'+str(index)
        size = lang_info[lang][artKey]['size']
        if size > 0:
            x.append(size)
    x.sort()
    mid = len(x)/2

    median_size = 0
    if len(x) > 0:
        if len(x) % 2:
            median_size = x[mid]    
        else:
            median_size = (x[mid-1] + x[mid]) / 2
    return int(median_size * GetWeightForLang(lang))

def PrintResults():

    lang_keys.sort(key=GetScoreForLang, reverse=True)

    print '\n'
    print 'RESULTS\n----------------------------------------------------------------------'
    print u'Lang:',' AvgSize','Median','Absent','   <8k ','8-16k','  >16k ', 'Score', 'Growth'
    for lang in lang_keys:

        absent        = lang_info[lang]['absent']
        stubs         = lang_info[lang]['stubs']
        articles      = lang_info[lang]['articles']
        longarticles  = lang_info[lang]['longarticles']

        article_count = stubs + articles + longarticles
        score         = GetScore(absent, stubs, articles, longarticles)
        growth        = GetGrowth(lang, score)
        avg_size      = GetAverageSize(lang, article_count)
        med_size      = GetMedianSize(lang)

        print lang.ljust(6),
        print str(avg_size).rjust(7),
        print str(med_size).rjust(7),
        print str(absent).rjust(5),
        print str(stubs).rjust(6),
        print str(articles).rjust(6),
        print str(longarticles).rjust(6),
        print ("%6.2f" % score).rjust(6),
        print growth.rjust(6)

def GetWikiTableResults():

    lang_keys.sort(key=GetScoreForLang, reverse=True)

    table = u'This list of Wikipedias is based on the [[List of articles every Wikipedia should have/Expanded]] as a sample. The list currently has 10000 articles. For every Wikipedia, the articles in this sample list are retrieved (based on interwiki links from Wikidata) and the number of characters is calculated (minus "comments" and the "interwiki" text at the bottom of the article). The size of each article is then adjusted for each language by multiplying it by the language weight. The articles are divided into four classes:'
    table += u'\n'
    table += u'* "absent" (i.e. non-existing; size = 0),'
    table += u'\n'
    table += u'* "stubs" (weighted \'\'size in characters\'\' less than 8 000),'
    table += u'\n'
    table += u'* "articles" (size between 8 000 and 16 000),'
    table += u'\n'
    table += u'* "long articles" (size more than 16 000).'
    table += u'\n'
    table += u'The mean and median weighted sizes of all the non-absent articles in the sample are also calculated. Finally, a score is computed, based on the following formula:'
    table += u'\n\n'
    table += u'<code> rawscore = stubs*2 + articles*3 + long_articles*4</code>.'
    table += u'\n\n'
    table += u'In order to have a consistent scale, the raw score is normalized by dividing by the maximum score and multiplying by 100. The maximum score is <code> maxscore = total_items*4</code>, where total_items ≤ 10000. The final score is then'
    table += u'\n\n'
    table += u'<code>score = rawscore / (total_items * 0.04)</code>.'
    table += u'\n\n'
    table += u'The language editions are then listed in order of decreasing score. The shortest articles for major Wikipedias are in [[List of Wikipedias by expanded sample of articles/Shortest]].'
    table += u'\n\n'
    table += u'{{lists of Wikipedias}}'
    table += u'\n\n'
    table += u'== List =='
    table += u'\n'
    table += u'Last Update: ' + date.today().strftime('%-d %B %Y') + '.'
    table += u'\n\n'
    table += u'{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"'
    table += u'\n|-\n'
    table += u'!width = 45 | № !! width = 55 | Wiki !! width = 220 | Language !! width = 55 | [[Talk:List of Wikipedias by sample of articles/Archives/2011#One thought about weights|Weight]] !! width = 120 | Mean Article<br>Size !! width = 120 | [[Talk:List_of_Wikipedias_by_sample_of_articles#average_or_median.3F|Median Article<br>Size]] !! width = 80 | [[/Shortest|Absent]]<br>(0k) !! width=80| Stubs<br>(< 8k)!! width = 80 | Articles<br>(8-16k) !! width = 80 | Long Art.<br>(> 16k) !! width = 80 | Score'
    table += u'!! width = 50 | Growth'
    table += u'\n|-\n'
    i=0
    for lang in lang_keys:
        i += 1

        absent        = lang_info[lang]['absent']
        stubs         = lang_info[lang]['stubs']
        articles      = lang_info[lang]['articles']
        longarticles  = lang_info[lang]['longarticles']

        article_count = stubs + articles + longarticles

        dagger = u'†'
        if absent + article_count == 0:
            lang_footnote = dagger
            absent = lang_info['en']['art_count']
        else:
            lang_footnote = ''

        table += '|' + str(i) + '\n'
        table += '| [[:' + lang + ':|' + lang + ']]' + lang_footnote + '\n'
        table += '| style = "text-align: left" | [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]\n'

        if 'weight' in lang_info[lang]:
            weight = str(lang_info[lang]['weight'])
        elif 'similar_lang' in lang_info[lang]:
            weight = str(lang_info[lang_info[lang]['similar_lang']]['weight']) + '**' 
        else:
            weight = '1.0*' 

        score         = GetScore(absent, stubs, articles, longarticles)
        growth        = GetGrowth(lang, score)
        avg_size      = GetAverageSize(lang, article_count)
        med_size      = GetMedianSize(lang)


        #if HasAwards(awards, lang):
            #growth = GetLink('Growth',lang, growth)

        table += '| ' + weight + '\n'
        table += '| ' + FormatNumber(avg_size) + '\n'
        table += '| ' + FormatNumber(med_size) + '\n'
        table += '| ' + GetTableNumber(absent,       1, 1000,'Shortest', lang, 3000) + '\n'
        table += '| ' + GetTableNumber(stubs,        1, 0,'Shortest',           lang, 1000) + '\n'
        table += '| ' + GetTableNumber(articles,     1, 0,'Articles',        lang, 0) + '\n'
        table += '| ' + GetTableNumber(longarticles, 1, 0,'Long Articles',   lang, 0) + '\n'

        #color code score
        if score >= 100.00:    
            color = "|style = \"background: "+u'\u0023'+color10000+"\""
        elif score >= 40.00:    
            color = "|style = \"background: "+u'\u0023'+color4000+"\""
        elif score >= 20.00:
            color = "|style = \"background: "+u'\u0023'+color2000+"\""
        elif score >= 10.00:
            color = "|style = \"background: "+u'\u0023'+color1000+"\""
        elif score >= 5.00:
            color = "|style = \"background: "+u'\u0023'+color500+"\""
        elif score >= 2.50:
            color = "|style = \"background: "+u'\u0023'+color250+"\""
        elif score >= 1.00:
            color = "|style = \"background: "+u'\u0023'+color100+"\""
        else:
            color = "|style = \"background: "+u'\u0023'+color0+"\""
            
        table += color + '| ' + ("%.2f" % score) + '\n'
        table += '| ' + growth + '\n'
        table += '|-\n'
        
    table = table[:-2] + '}\n'
    table += '\n[[Category:Lists of Wikipedias|Expanded sample]]'
    return table

def GetWikiTableArticles(article_type, min_articles, max_articles_0, max_articles_40=0):
    lang_keys.sort()
    table = u''
    i=0
    for lang in lang_keys:
        i += 1
        count=0

        max_articles = max_articles_0
        score = GetScoreForLang(lang)
        if score > 40 and max_articles_40 > 0:
            max_articles = max_articles_40

        section = u'==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
        for index in range(lang_info[lang]['art_count']):
            artKey  = 'art_'+str(index)
            artWtSize = GetArticleSize(lang, artKey)
            artType = GetArticleType(artWtSize)
            if artType == article_type:
               section += '#[[d:'+lang_info[lang][artKey]['item']+'|'+lang_info[lang][artKey]['name']+']] '+lang_info[lang][artKey]['error'] + '\n'
               count += 1
        if min_articles <= count <= max_articles:
            table += section

    return table


def GetArticleName(lang, artKey):

    if artKey in lang_info[lang]:
       return lang_info[lang][artKey]['name']
    else:
       return 0


def GetArticleSize(lang, artKey):

    if artKey in lang_info[lang]:
       if lang_info[lang][artKey]['error'] :
          return 0
       return lang_info[lang][artKey]['size'] * GetWeightForLang(lang)
    else:
       return 0

def GetEdgeFactor(lang, artKey):
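    # Bonus for articles sitting just below one of the GetArticlePoints
    # thresholds: within 3k under the 10k limit or within 6k under the 30k
    # limit, the bonus grows by 1 per 1000 characters. Absent articles return 1.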

    size = GetArticleSize(lang, artKey)
    if size==0:
        return 1
    if 7000 < size < 10000:
        return (size - 7000) / 1000
    if 24000 < size < 30000:
        return (size - 24000) / 1000
    else:
        return 0

def GetRuntFactor(lang, artKey):

    size = GetArticleSize(lang, artKey)

    if size > 0:
        for index in range(lang_info['en']['art_count']):
            otherArtKey =  'art_'+str(index)
            if otherArtKey != artKey:
               otherSize = GetArticleSize(lang, otherArtKey)
               if 0 < otherSize < size:
                   return 0 #you are not the runt
        return 4
    return 0


def GetArticlePoints(lang, artKey):

    size = GetArticleSize(lang, artKey)
    if 0 < size < 10000:
       return 1
    elif 10000 <= size < 30000:
       return 4
    elif size >= 30000:
       return 9
    return 0
    

def GetAverageArticlePoints(artKey):
    total = sum(GetArticlePoints(lang, artKey) for lang in lang_keys)
    return float(total) / len(lang_keys)


def GetAverageArticleSize(artKey):
    total = sum(GetArticleSize(lang, artKey) for lang in lang_keys)
    return int(float(total) / len(lang_keys))


def GetNeglectForArticle(lang, artInfo):
    artKey = artInfo['artKey']
    avgPnts = GetAverageArticlePoints(artKey) #0 to 9
    pnts = GetArticlePoints(lang, artKey)     #0 to 9
    edgeFactor = GetEdgeFactor(lang, artKey)  #0 to 6
    runtFactor = GetRuntFactor(lang, artKey)  #0 to 4
    return avgPnts - pnts + edgeFactor + runtFactor

def GetArticlesSortedByNeglect(lang):
    artInfos = []

    if 'art_count' in lang_info['en']:
      for index in range(lang_info['en']['art_count']):
        artKey =  'art_'+str(index)
        artInfos.append( {} )
        artInfos[index]['artKey']  = artKey
        artInfos[index]['popularity']  = GetAverageArticleSize(artKey)
        artInfos[index]['neglect'] = GetNeglectForArticle(lang, artInfos[index])
    artInfos.sort(key=lambda x: (x['neglect'], x['popularity']), reverse=True)
    return artInfos

def GetLargestArticles(artKey, maxLangs):
        
        lang_keys = lang_info.keys()
        lang_keys.sort(key=lambda lang: GetArticleSize(lang, artKey), reverse=True)
        
        item = lang_info['en'][artKey]['item']
        
        ret = []
        for lang in lang_keys[0:maxLangs]:
           ret.append ( '[['+lang+':'+GetArticleInterwikiName(item, lang)+'|'+lang+':'+FormatNumber(GetArticleSize(lang, artKey))+']]' )
        return ' -- '.join(ret)

def GetArticleTypeCount(artKey,points):

        return len([lang for lang in lang_keys if GetArticlePoints(lang, artKey) == points])


def GetNeglectedArticles(lang, max_articles):

    artInfos = GetArticlesSortedByNeglect(lang)

    i=0
    table = u''
    for artInfo in artInfos:

        if artInfo['artKey'] in lang_info[lang]:

            item = lang_info[lang][artInfo['artKey']]['item']
            name = lang_info[lang][artInfo['artKey']]['name']
            table += '#[[d:'+item+'|'+name+']]'

            size = int(GetArticleSize(lang, artInfo['artKey']))
            if size > 0:
                iw_name = GetArticleInterwikiName(item, lang)
                if iw_name == '':
                    table += ' ('+str(size)+')'
                else:
                    iw_link = lang+':'+iw_name
                    table += ' ([['+iw_link+'|'+str(size)+']])'

            table += '\n'

        i += 1
        if i >= max_articles: break

    return table


def GetPopularArticles(max_articles):

    artInfos = GetArticlesSortedByNeglect('en')
    artInfos.sort(key=lambda x: x['popularity'], reverse=True)

    i=0

    table = '{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"'
    table += '\n|-\n'
    table += u'!width = 45 | № !! width = 90 | Average Size !! width = 150 | Article Name !! width = 80 | [[Talk:List of Wikipedias by sample of articles#Article metric|Absent<br>(0k)]] !! width=80| Stubs<br>(< 10k)!! width = 80 | Articles<br>(10-30k) !! width = 80 | Long Art.<br>(> 30k) !! width = 150 | Largest Articles\n'

    for artInfo in artInfos:
       i+=1
       artKey = artInfo['artKey']
       table += '|-\n'
       table += '|' + str(i)
       table += '||'+FormatNumber(artInfo['popularity']) 
       table += '||style="text-align:left"|[[d:'+lang_info['en'][artKey]['item']+'|'+lang_info['en'][artKey]['name']+']]' 
       table += '||'+str(GetArticleTypeCount(artKey,0))
       table += '||'+str(GetArticleTypeCount(artKey,1))
       table += '||'+str(GetArticleTypeCount(artKey,4))
       table += '||'+str(GetArticleTypeCount(artKey,9))
       table += '||'+GetLargestArticles(artKey,4)+'\n'
       if i >= max_articles > 0: break

    table += '|}\n'

    return table


def GetWikiNeglectedArticles():
    lang_keys.sort()
    table = u''

    print 'writing Popular Articles...'
    table += u'==Popular Articles==\n'
    table += GetPopularArticles(-1)

    print 'writing Neglected Articles...'
    table += u'==Neglected Articles==\n'
    for lang in lang_keys:
        print ' '+lang

        if lang_info[lang]['art_count'] > 0:
            table += u'==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
            table += GetNeglectedArticles(lang, 10)

        has_errors = False
        section = u'====Errors====\n'
        for index in range(lang_info[lang]['art_count']):
            artKey  = 'art_'+str(index)
            if lang_info[lang][artKey]['error']:
               section = section + '#[[d:'+lang_info[lang][artKey]['item']+'|'+lang_info['en'][artKey]['name']+']] '+lang_info[lang][artKey]['error'] + '\n'
               has_errors = True
        if has_errors:
            table = table + section
       
    return table

def SaveWikiTableResults(awards):

    print 'writing Results...'
    f = open('results.txt', 'w')
    f.write(GetWikiTableResults(awards).encode("utf_8"))
    f.close()

    print 'writing Absent...'
    f = open('_absent.txt', 'w')
    f.write(GetWikiTableArticles('absent',1, 250).encode("utf_8"))
    f.close()

    print 'writing Stubs...'
    f = open('_stub.txt', 'w')
    f.write(GetWikiTableArticles('stubs',1, 100, 250).encode("utf_8"))
    f.close()

    print 'writing Articles...'
    f = open('_articles.txt', 'w')
    f.write(GetWikiTableArticles('articles',1, 100, 250).encode("utf_8"))
    f.close()

    print 'writing Long Articles...'
    f = open('_longarticles.txt', 'w')
    f.write(GetWikiTableArticles('longarticles',1,100).encode("utf_8"))
    f.close()

    print 'writing Awards...'
    f = open('_growth.txt', 'w')
    f.write(GetWikiAwards(awards).encode("utf_8"))
    f.close()

    print 'writing Suggestions...'
    f = open('_neglectedarticles.txt', 'w')
    f.write(GetWikiNeglectedArticles().encode("utf_8"))
    f.close()
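
# SaveWikiTableResults leaves one wiki-ready fragment per section next to the
# script (results.txt, _absent.txt, _stub.txt, _articles.txt,
# _longarticles.txt, _growth.txt and _neglectedarticles.txt), ready to be
# copied into the corresponding Meta pages.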
    
    

def CookString(rawString):

    cookString = ''
    for part in rawString.replace("'","||").split("|"):
        if len(part)==0:
            cookString += "'"
        else:
            cookString += eval("u'"+part+"'")
    return cookString        
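
# CookString decodes fragments that were saved as Python u'...' escape
# sequences (presumably by an earlier run): each piece between pipes is
# re-evaluated as a unicode literal, and the replace/split dance lets single
# quotes survive the eval. Illustrative round trip: CookString("Fran\\xe7ais")
# returns u'Français'. Since eval() trusts its input, the file being cooked
# must not contain hostile content.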
        
def GetGrowths(article):
    growths = {}
    lang_last  = 0
    lang_first = article.find(u'[[:', lang_last)
    while lang_first > -1:
        lang_last  = article.find(u'|',  lang_first)
        if lang_last == -1:
            break
        lang = article[lang_first+3:lang_last-1]
        score_first = article.find(u'style = "background:',lang_last)
        if score_first == -1:
            break
        score_last  = article.find(u'|', score_first+32)
        if score_last == -1:
            break
        growth_end = article.find(u'\n', score_last)
        growth_str = article[score_last+2:growth_end]
        try:
           growth_pipe = growth_str.find(u'|') 
           if growth_pipe > -1:
              growth_str = growth_str[growth_pipe+1:-2]
           if growth_str.find(u' ‡') > -1:
              growth_str = growth_str[0:-2]
           growth = float(growth_str)
        except:
           growth = 0 
        growths[lang]=growth
        lang_first = article.find(u'[[:', score_last)
    return growths
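
# GetGrowths screen-scrapes the rendered ranking table: it assumes each row
# carries an interlanguage link like [[:de:|...]], followed by a
# style = "background:..." cell and then the growth figure, possibly piped
# and possibly flagged with a trailing " ‡". A row that breaks the pattern
# ends the scan, and an unparseable growth string falls back to 0.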

def GetLastUpdated(article):
    date_first = article.find(u'Last Update')
    if date_first > -1:
       date_last_paren = article.find(u'(', date_first)
       date_last_br = article.find(u'<br/>', date_first)

       if date_last_paren > -1 and date_last_paren < date_last_br :
          date_last = date_last_paren
       else:
          date_last = date_last_br
       if date_last > -1:
           hyphen = article.find(u'-', date_first,date_last)
           if hyphen > -1:
               date_first = hyphen+1
           else:
               date_first += 12

           parts = article[date_first:date_last].strip().split(' ')
           if len(parts[0])==1:
              parts[0] = '0'+parts[0]
           if parts[0][0]==':':
              parts[0] = '0'+parts[0][1]
           parts[1] = parts[1][0:3]
           return ' '.join(parts)
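
# GetLastUpdated normalizes the "Last Update" stamp from the old page into a
# 'DD Mon YYYY' key, e.g. 'Last Update: 5 September 2013 (...' becomes
# '05 Sep 2013' (example string assumed; only the day padding and the
# three-letter month truncation are guaranteed by the code above).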

def CalculatePlacing(growths,oldid,update):
    lang_keys = growths.keys()
    lang_keys.sort(key=lambda x: growths[x], reverse=True)
    placeNo=0

    print update

    placing = []
    for lang in lang_keys:
        if placeNo < 3 or growths[lang] > 1:
           placeNo += 1
           if placeNo==1:
              placestr = '1st Place'
              ribbonimg = 'Article blue.svg'
           elif placeNo==2:
              placestr = '2nd Place'
              ribbonimg = 'Article red.svg'
           elif placeNo==3:
              placestr = '3rd Place'
              ribbonimg = 'Article yellow.svg'
           elif placeNo>3:
              placestr = 'Honorable Mention'
              ribbonimg = 'Article green.svg'

           print " %d  %-3s %+2.2f" % (placeNo, lang, growths[lang])

           place = {'lang':lang,'growth':growths[lang],'oldid':oldid,'update':update,'placestr':placestr,'ribbonimg':ribbonimg}
           placing.append(place) 
    return placing


def GetPreviousAwards():

    article_name = 'List of Wikipedias by sample of articles'

    meta_wiki = pywikibot.Site('meta', 'meta')
    meta_page = pywikibot.Page(meta_wiki, article_name)
    
    awards = {}
    prevUpdate = ''
    prevGrowth = -999

    for oldid,datetime,username,comments,cursize,curx in meta_page.getVersionHistory():
        in_years = any(year in datetime for year in ('2009', '2010', '2011', '2012', '2013'))
        is_update = "updat" in comments.lower() or 'correct' in comments.lower()
        skip_oldids = (2228213, 2264612, 3122655, 3359817)  # revisions excluded by hand, presumably bad updates
        if in_years and is_update and oldid not in skip_oldids:
            article   = meta_page.getOldVersion(get_redirect=False,oldid=oldid)
            growths = GetGrowths(article)
            if 'en' in growths:
                update = GetLastUpdated(article)
                growth = growths['en']
                if update != prevUpdate and ( prevGrowth != growth or oldid > 3807780 ):
                    prevUpdate = update
                    prevGrowth = growth
                    awards[update] = CalculatePlacing(growths,oldid,update)
    return awards                

def HasAwards(awards, lang):
    
    for placings in awards.values():
        for place in placings:
            if lang == place['lang']:
                return True
    return False

def CompareRows(rowA,rowB):
    if rowA['place']['placestr']==rowB['place']['placestr']:
        return cmp(rowB['place']['growth'],rowA['place']['growth'])
    return cmp(rowA['place']['placestr'],rowB['place']['placestr'])
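
# CompareRows relies on the place strings sorting alphabetically in rank
# order: '1st Place' < '2nd Place' < '3rd Place' < 'Honorable Mention'.
# Within a rank, larger growth sorts first. cmp() and list.sort(cmp) are
# Python 2 only, matching the rest of this script.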

def GetWikiAwards(awards):
    table = u'==2009-2013 Improvement Awards==\n'
    for lang in lang_keys:
        section = u'==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
        rows = []
        for update, placings in awards.items():
           for place in placings:
               if lang == place['lang']:
                  mid_section = '|-\n'
                  mid_section += '|width = 150 | [[Image:%s|20px]] %s\n' % (place['ribbonimg'],place['placestr'])
                  if place['oldid'] == -1:  
                      mid_section += '|width = 120 align=center| [[:m:List of Wikipedias by sample of articles|%s]]\n' % (place['update'])
                  else:  
                      mid_section += '|width = 120 align=center| <span class="plainlinks">[http://meta.wikimedia.org/w/index.php?title=List_of_Wikipedias_by_sample_of_articles&oldid=%s %s]</span>\n' % (place['oldid'],place['update'])
                  mid_section += '|width = 80 align=center| %+2.2f\n' % round(place['growth'],2)
                  rows.append({'place':place,'mid_section':mid_section})
        if len(rows) > 0:
            rows.sort(CompareRows)
            if len(rows) > 1:
                section += '{|class="wikitable sortable" cellpadding="6" cellspacing="0"\n'
                section += '! !! !!\n'
            else:
                section += '{|class="wikitable" cellpadding="6" cellspacing="0"\n'
            for row in rows:
                section += row['mid_section']
            section += '|}\n'
            table += section
    return table

def CalculateAwards():

    print "calculating awards..."
    
    todays = {}
    for lang in lang_keys:
        absent        = lang_info[lang]['absent']
        stubs         = lang_info[lang]['stubs']
        articles      = lang_info[lang]['articles']
        longarticles  = lang_info[lang]['longarticles']
        score         = GetScore(absent, stubs, articles, longarticles)
        growth        = GetGrowthNumber(lang, score)
        todays[lang] = growth

    update = strftime("%d %b %Y")
    placing = CalculatePlacing(todays,-1,update)

    awards = GetPreviousAwards()
    awards[update] = placing
    return awards

def GetArticleExists(lang, artKey):

    size = GetArticleSize(lang, artKey)
    if size > 0:
       return 1
    else:
       return 0

def GetNumberExists(artKey):
    total = sum(GetArticleExists(lang, artKey) for lang in lang_keys)
    return total

def GetMostCommonList():
    artInfos = []

    for index in range(len(item_list)):
      artKey =  'art_'+str(index)
      artInfos.append( {} )
      artInfos[index]['artKey']  = artKey
      artInfos[index]['numberExists']  = GetNumberExists(artKey)
      artInfos[index]['averageSize']  = GetAverageArticleSize(artKey)
    artInfos.sort(key=lambda x: (x['numberExists'], x['averageSize']), reverse=True)
    most_common_list = [artInfo['artKey'] for artInfo in artInfos]
    return most_common_list
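
# GetMostCommonList orders every sampled item by how many wikis have it at
# all, breaking ties by average weighted size. WriteShortestSection re-sorts
# by local length, so this ordering mainly decides how equal-length entries
# (notably the absent, length-0 ones) are listed, via Python's stable sort.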

def WriteShortestSection(lang, most_common_list):
    artInfos = []

    for artKey in most_common_list:
        newDict = {}
        newDict['artKey']  = artKey
        newDict['length']  = GetArticleSize(lang, artKey)
        artInfos.append( newDict )
    artInfos.sort(key=lambda x: (x['length']), reverse=False)
    count=0

    section = u'\n==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
    for artInfo in artInfos:
        artKey = artInfo['artKey']
        if artKey not in lang_info[lang]:
            continue
        artWtSize = '%d' % artInfo['length']
        section += '#[[d:'+lang_info[lang][artKey]['item']+'|'+lang_info[lang][artKey]['name']+']] ' + artWtSize + ' ' + lang_info[lang][artKey]['error'] + '\n'
        count += 1
        if count >= 200:
            return section
    return section

def GetWikiShortestArticles():
    lang_keys.sort()
    table = u'The 200 shortest articles found when generating each Wikipedia\'s score. Wikipedias whose score is 40 or less are excluded from this list.'
    table += u'\n\n'
    most_common_list = GetMostCommonList()

    for lang in lang_keys:
        score = GetScoreForLang(lang)
        if score > 40:
            table += WriteShortestSection(lang, most_common_list)
    table += u'\n[[Category:Lists of Wikipedias|Expanded sample]]'

    return table

def SavePreviousScore():
    article_name = 'List of Wikipedias by expanded sample of articles'

    meta_wiki = pywikibot.Site('meta', 'meta')
    meta_page = pywikibot.Page(meta_wiki, article_name)
    article   = meta_page.get(get_redirect=False)

    f = open('PreviousScores.txt', 'w')
    count = 0
    lang_last  = 0
    lang_first = article.find(u'[[:', lang_last)
    while lang_first > -1:
        lang_last  = article.find(u'|',  lang_first)
        if lang_last == -1:
            break

        lang = article[lang_first+3:lang_last-1]

        score_first = article.find(u'style = "background:',lang_last)
        if score_first == -1:
            break
        score_last  = article.find(u'|', score_first+32)
        if score_last == -1:
            break

        score = article[score_first+31:score_last-1]

        f.write(lang + ' ' + score + '\n')

        count += 1
        print count, lang, score
        lang_first = article.find(u'[[:', score_last)

    f.close()

def WriteResultsToFile():

    print 'writing Results...'
    f = open('results.txt', 'w')
    f.write(GetWikiTableResults().encode("utf_8"))
    f.close()

    print 'writing Shortest...'
    f = open('_shortest.txt', 'w')
    f.write(GetWikiShortestArticles().encode("utf_8"))
    f.close()

#support dividing up work
if len(sys.argv) == 3:
    part      = int(sys.argv[1])-1
    numparts  = int(sys.argv[2])
    lang_keys = filter(lambda lang: lang_keys.index(lang) % numparts == part, lang_keys)
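
# Usage sketch: "python ListExpandedSample.py 2 4" keeps every 4th language
# starting from the 2nd (indices 1, 5, 9, ... of lang_keys), so four such
# invocations together cover the whole list.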

def oldMain():
    GetPreviousScores()
    CalculateStatistics()
    awards = CalculateAwards()
    PrintResults()
    SaveWikiTableResults(awards)
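
# oldMain appears to be retained from the pre-fork sample-of-articles script;
# nothing invokes it below, but it documents the awards-based workflow that
# SaveWikiTableResults still supports. main() is the live entry point.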

def main():
    SavePreviousScore()
    GetPreviousScores()
    GetItemList()
    GetIwLinks()
    CalculateStatistics()
    PrintResults()
    WriteResultsToFile()

if __name__ == '__main__':
    try:
        main()
    finally:
        pywikibot.stopme()