Language tools/checkutftable

From Meta, a Wikimedia project coordination wiki
Jump to: navigation, search

Note that this program is licensed under the GNU GPL, not the GNU FDL. If you contribute to this program, please add your name to the copyright notice.

I made this module to check whether a text on Serbian Wikipedia is written in Cyrillic or in Latin script. However, I realized that it can be useful for other languages, too. Characters that do not belong strictly to any script are described as 'not important'. If a character is not recognized by the unicodedata module, it is described as 'not known'.

#!/usr/bin/python
#
# Copyright (C) 2006 Milos Rancic
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# 
# The text of the GNU GPL can be found at http://www.gnu.org/licenses/gpl.html

import sys
import unicodedata

# engine
def check(f):
        """Count the characters of a UTF-8 text file per script group.

        Every character is classified by the first four characters of its
        Unicode name (as reported by ``unicodedata.name``), e.g. ``'CYRI'``
        for ``CYRILLIC SMALL LETTER A``.

        f -- path of the file to analyse; its content must be valid UTF-8.

        Returns a dict mapping each key below to a character count:
          'all'   every character in the file
          'ni'    "not important": the character has a name, but its prefix
                  is not in the table (spaces, digits, punctuation, and any
                  script the table does not list)
          'nnnn'  "not known": unicodedata has no name for the character
          plus one four-character key per listed script prefix.

        Raises IOError/OSError if the file cannot be opened and
        UnicodeDecodeError if it is not valid UTF-8.
        """
        # Read bytes and decode explicitly: this behaves the same on
        # Python 2 and 3, and the with-block closes the file handle.
        with open(f, 'rb') as source:
                text = source.read().decode('utf-8')

        # Four-character prefixes of Unicode character names, one per group.
        script_prefixes = (
                'ARAB',  # ARABIC
                'ARME',  # ARMENIAN
                'BENG',  # BENGALI
                'BOPO',  # BOPOMOFO
                'BRAI',  # BRAILLE PATTERN
                'BUHI',  # BUHID
                'CANA',  # CANADIAN
                'CHER',  # CHEROKEE
                'CJK ',  # CJK
                'COMB',  # COMBINING
                'COPT',  # COPTIC
                'CYRI',  # CYRILLIC
                'DESE',  # DESERET
                'DEVA',  # DEVANAGARI
                'ETHI',  # ETHIOPIC
                'EXTE',  # EXTENDED ARABIC
                'GEOR',  # GEORGIAN
                'GOTH',  # GOTHIC
                'GREE',  # GREEK
                'GUJA',  # GUJARATI
                'GURM',  # GURMUKHI
                'HANG',  # HANGUL
                'HANU',  # HANUNOO
                'HEBR',  # HEBREW
                'HIRA',  # HIRAGANA
                'IDEO',  # IDEOGRAPHIC
                'KANG',  # KANGXI RADICAL
                'KANN',  # KANNADA
                'KATA',  # KATAKANA
                'KHME',  # KHMER
                'LAO ',  # LAO
                'LATI',  # LATIN
                'MALA',  # MALAYALAM
                'MODI',  # MODIFIER LETTER
                'MONG',  # MONGOLIAN
                'MYAN',  # MYANMAR
                'OGHA',  # OGHAM
                'OLD ',  # OLD ITALIC
                'ORIY',  # ORIYA
                'PHIL',  # PHILIPPINE
                'RUNI',  # RUNIC
                'SINH',  # SINHALA
                'SYRI',  # SYRIAC
                'TAGA',  # TAGALOG
                'TAGB',  # TAGBANWA
                'TAMI',  # TAMIL
                'TELU',  # TELUGU
                'THAA',  # THAANA
                'THAI',  # THAI
                'TIBE',  # TIBETAN
                'YI R',  # YI RADICAL
                'YI S',  # YI SYLLABLE
        )

        # Bookkeeping counters first, then one zeroed counter per script.
        numchar = {'all': 0, 'ni': 0, 'nnnn': 0}
        for prefix in script_prefixes:
                numchar[prefix] = 0

        for character in text:
                # 'nnnn' is returned for characters unicodedata cannot name,
                # which lands them directly in the 'nnnn' counter.
                name = unicodedata.name(character, 'nnnn')
                # Names shorter than four characters are padded so that they
                # can never be confused with a four-character table key.
                group = name[:4].ljust(4)
                if group not in numchar:
                        group = 'ni'
                numchar[group] += 1
                numchar['all'] += 1
        return numchar

# the list of existing character groups
def describe(d):
        """List the character groups that occur, with their share of the text.

        d -- count table as returned by check(): group key -> count, with
             the total stored under 'all'.

        Returns a dict mapping every group whose count is positive (the
        'all' entry excluded) to the pair [group, count / d['all']].
        """
        return dict(
                (group, [group, float(d[group]) / float(d['all'])])
                for group in d
                if (d[group] > 0) and (group != 'all')
        )

# what group of characters has absolute majority, what relative
def decide(d):
        """Name the dominant character group of a count table.

        d -- count table as returned by check(): group key -> count, with
             the total stored under 'all'.

        Returns a three-element list [kind, group, share]:
          ['a', group, share]  group holds an absolute majority (> 0.5)
          ['r', group, share]  no absolute majority; group has the largest
                               share (the first one met wins a tie)
          ['n', 'n', 0]        no group has a positive count, e.g. the
                               analysed file was empty
        """
        majority = ''   # group with a share above one half, if any
        leader = ''     # group with the largest share seen so far
        top_share = 0
        perc = {}
        for n in d:
                if (d[n] > 0) and (n != 'all'):
                        perc[n] = float(d[n])/float(d['all'])
                        if perc[n] > top_share:
                                top_share = perc[n]
                                leader = n
                        if perc[n] > 0.5:
                                majority = n
        if majority != '':
                return ['a', majority, perc[majority]]
        if leader != '':
                return ['r', leader, top_share]
        # Nothing was counted: report "undecided" instead of failing on an
        # unset leader; the triple matches the one analyzecyr() uses.
        return ['n', 'n', 0]

# for Serbian (Wikipedia): what is the script of the text
def analyzecyr(d):
        """Decide whether a text is written in Cyrillic or in Latin script.

        Only the 'CYRI' and 'LATI' counts are compared; every other group
        is ignored. The argument is not modified.

        d -- count table as returned by check(): group key -> count.

        Returns a three-element list [kind, script, share], where share is
        the script's part of the Cyrillic + Latin total:
          ['a', 'CYRI', share]  Cyrillic share is above 0.5
          ['r', 'CYRI', share]  Cyrillic share is above 0.3, up to 0.5
          ['a', 'LATI', share]  otherwise (Latin share is then >= 0.7)
          ['n', 'n', 0]         undecided: a key is missing from d, or the
                                text has neither Cyrillic nor Latin letters
        """
        undecided = ['n', 'n', 0]
        if 'CYRI' not in d or 'LATI' not in d:
                return undecided

        # Kept in a local so that the caller's table does not gain an extra
        # key that describe() or decide() would treat as a character group.
        cyrlat = d['CYRI'] + d['LATI']
        if cyrlat == 0:
                # No letters of either script: avoid dividing by zero.
                return undecided

        cyr_share = float(d['CYRI'])/float(cyrlat)
        lat_share = float(d['LATI'])/float(cyrlat)

        # Cyrillic is tested first, so a share above 0.3 is reported as
        # Cyrillic even when Latin letters are more numerous.
        # NOTE(review): presumably because Cyrillic pages also contain Latin
        # markup and links -- confirm with the author.
        if cyr_share > 0.5:
                return ['a', 'CYRI', cyr_share]
        if cyr_share > 0.3:
                return ['r', 'CYRI', cyr_share]
        # The two shares add up to 1, so Latin is at least 0.7 here.
        return ['a', 'LATI', lat_share]

# if the program is started from the command line
# Run the analysis only when executed as a script, so that importing this
# module never reads the importing program's command-line arguments.
if __name__ == '__main__':
        if len(sys.argv) > 1:
                dictionary = check(sys.argv[1])
                # Single-argument print(...) gives the same output on
                # Python 2 and Python 3.
                print(describe(dictionary))
                print(decide(dictionary))
                print(analyzecyr(dictionary))
        else:
                sys.stderr.write('usage: %s FILE\n' % sys.argv[0])