Translating Dictionary

From Meta, a Wikimedia project coordination wiki

This is a translating and disambiguating dictionary that uses raw Wikipedia data to find translations and disambiguations. It was written by Simon Kissane.


<?
/**
 * One Wikipedia article fetched through Special:Export, plus helpers that
 * mine its wikitext for translations (interlanguage links), related terms
 * (internal links) and disambiguations.
 *
 * Constructing an instance performs an HTTP request. The XML handler
 * methods (xml_start / xml_end / xml_cdata) are public because they are
 * passed to the XML parser as callbacks.
 */
class WikipediaArticle {
    /** Language code the article was requested in (used as the Wikipedia subdomain). */
    public $lang;
    /** Article title as reported by the export XML; "" when nothing was parsed. */
    public $title = "";
    /** Raw wikitext of the article; "" when nothing was parsed. */
    public $text = "";

    /** Name of the XML element whose character data is currently collected, or NULL. */
    public $in;

    /** Article name most recently passed to get_article(). */
    public $requested = "";

    /**
     * Base URL of Special:Export on the given language's Wikipedia.
     *
     * @param string $lang language code; anything not shaped like a subdomain
     *                     label falls back to "en"
     * @return string URL prefix to which the encoded article name is appended
     */
    public function getURL($lang) {
        $lang = self::safeLang($lang);
        return "http://{$lang}.wikipedia.org/wiki/Special%3AExport/";
    }

    /**
     * Returns $lang when it consists only of letters, digits and inner
     * hyphens, otherwise "en". The code ends up inside a host name, so
     * characters such as "/", "." or "@" must never get through.
     */
    private static function safeLang($lang) {
        if (is_string($lang) && preg_match('/\A[a-z0-9]+(?:-[a-z0-9]+)*\z/i',$lang))
            return $lang;
        return "en";
    }

    /**
     * Fetches and parses the article immediately.
     *
     * @param string $art  article name
     * @param string $lang language code (defaults to English)
     */
    public function __construct($art,$lang="en") {
        $this->lang = self::safeLang($lang);
        $this->get_article($art,$this->lang);
    }

    /**
     * Downloads the export XML for an article.
     *
     * @return string|false whatever file_get_contents() yields (FALSE on failure)
     */
    public function get_article_xml($art,$lang) {
        // The name becomes a path segment, so it needs RFC 3986 encoding:
        // urlencode() would turn spaces into "+", which is a literal plus
        // sign inside a path.
        return file_get_contents($this->getURL($lang) . rawurlencode($art));
    }

    /** XML callback: start collecting when a <title> or <text> element opens. */
    public function xml_start($xp,$tag,$attrs) {
        if ($tag == "title" || $tag == "text")
            $this->in = $tag;
    }

    /** XML callback: stop collecting when the element being collected closes. */
    public function xml_end($xp,$tag) {
        if ($this->in == $tag)
            $this->in = NULL;
    }

    /** XML callback: append character data to the field being collected. */
    public function xml_cdata($xp,$data) {
        if ($this->in == "title")
            $this->title .= $data;
        if ($this->in == "text")
            $this->text .= $data;
    }

    /**
     * Fetches an article and fills $title / $text from its export XML.
     * Both are left as "" when the download fails or returns nothing.
     *
     * @param string $art  article name
     * @param string $lang language code (defaults to English)
     */
    public function get_article($art,$lang="en") {
        $this->requested = (string) $art;
        // Start from a clean slate: the handlers append, so a second call
        // would otherwise glue the new article onto the previous one.
        $this->title = "";
        $this->text = "";
        $this->in = NULL;

        $xml = $this->get_article_xml($art,$lang);
        if (!is_string($xml) || $xml === "")
            return;

        $xp = xml_parser_create();
        // Keep element names as written; the handlers compare against lowercase names.
        xml_parser_set_option($xp, XML_OPTION_CASE_FOLDING, 0);
        xml_set_element_handler($xp,
                                array($this,"xml_start"),
                                array($this,"xml_end"));
        xml_set_character_data_handler($xp, array($this,"xml_cdata"));
        xml_parse($xp,$xml,TRUE);
    }

    /**
     * Table of known language codes and their English names.
     * Static so it can be used without fetching an article.
     *
     * @return array code => language name
     */
    public static function getLangs() {
        return array("aa" => "Afar",
                     "ab" => "Abkhazian",
                     "af" => "Afrikaans",
                     "als" => "Alsatian",
                     "am" => "Amharic",
                     "an" => "Aragonese",
                     "ar" => "Arabic",
                     "as" => "Assamese",
                     "ast" => "Asturian",
                     "ay" => "Aymara",
                     "az" => "Azeri",
                     "ba" => "Bashkir",
                     "bal" => "Baluchi",
                     "ban" => "Balinese",
                     "be" => "Belorussian",
                     "ber" => "Berber (Tamazight)",
                     "bg" => "Bulgarian",
                     "bh" => "Bihari",
                     "bi" => "Bislama",
                     "bn" => "Bengali",
                     "bo" => "Tibetan",
                     "br" => "Breton",
                     "bs" => "Bosnian",
                     "bug" => "Buginese",
                     "ca" => "Catalan",
                     "ceb" => "Cebuano",
                     "ch" => "Chamoru",
                     "che" => "Chechen",
                     "chm" => "Meadow Mari",
                     "chr" => "Cherokee",
                     "chv" => "Chuvash",
                     "co" => "Corsican",
                     "crh" => "Crimean Tatar",
                     "cs" => "Czech",
                     "csb" => "Kashubian",
                     "cy" => "Welsh",
                     "da" => "Danish",
                     "de" => "German",
                     "diu" => "Diudish",
                     "div" => "Dhivehi",
                     "dz" => "Dzongkha",
                     "el" => "Greek",
                     "en" => "English",
                     "eo" => "Esperanto",
                     "es" => "Spanish",
                     "eso" => "Ekspreso",
                     "et" => "Estonian",
                     "eu" => "Basque",
                     "fa" => "Persian",
                     "fi" => "Finnish",
                     "fiu" => "Karelian",
                     "fj" => "Fijian",
                     "fo" => "Faeroese",
                     "fr" => "French",
                     "fy" => "Frisian",
                     "ga" => "Irish",
                     "gay" => "Gayo",
                     "gd" => "Scottish Gaelic",
                     "gl" => "Galician",
                     "gn" => "Guarani",
                     "gs" => "Glosa",
                     "gu" => "Gujarati",
                     "gv" => "Manx Gaelic",
                     "ha" => "Hausa",
                     "he" => "Hebrew",
                     "hi" => "Hindi",
                     "hr" => "Croatian",
                     "hu" => "Hungarian",
                     "hy" => "Armenian",
                     "ia" => "Interlingua",
                     "iba" => "Iban",
                     "id" => "Indonesian",
                     "ie" => "Interlingue (ex occidental)",
                     "ik" => "Inupiak",
                     "is" => "Icelandic",
                     "it" => "Italian",
                     "iu" => "Inuktitut",
                     "ja" => "Japanese",
                     "jv" => "Javanese",
                     "ka" => "Georgian",
                     "kaw" => "Kawi",
                     "kk" => "Kazakh",
                     "kl" => "Greenlandic",
                     "km" => "Khmer",
                     "kn" => "Kannada",
                     "ko" => "Korean",
                     "ks" => "Kashmiri",
                     "ku" => "Kurdish",
                     "ky" => "Kirghiz (also Kyrgyz)",
                     "la" => "Latin",
                     "li" => "Limburgian",
                     "ln" => "Lingala",
                     "lo" => "Laotian",
                     "ls" => "Latino Sine Flexione",
                     "lt" => "Lithuanian",
                     "lv" => "Latvian",
                     "mad" => "Madurese",
                     "mak" => "Makasar",
                     "mg" => "Malagasy",
                     "mi" => "Maori",
                     "min" => "Minangkabau",
                     "mk" => "Macedonian",
                     "ml" => "Malayalam",
                     "mn" => "Mongolian",
                     "mo" => "Moldovan",
                     "mr" => "Marathi",
                     "ms" => "Malay",
                     "mt" => "Maltese",
                     "my" => "Burmese",
                     "na" => "Nauri",
                     "nah" => "Nahuatl",
                     "nds" => "Low Saxon",
                     "ne" => "Nepali",
                     "ng" => "Ndongo",
                     "nl" => "Dutch",
                     "no" => "Norwegian",
                     "oc" => "Occitan",
                     "om" => "Oromo",
                     "or" => "Oriya",
                     "pa" => "Punjabi",
                     "pl" => "Polish",
                     "ps" => "Pashto",
                     "pt" => "Portuguese",
                     "qu" => "Quechua",
                     "ra" => "Romanica",
                     "rm" => "Rhaeto-Romance",
                     "rn" => "Kirundi",
                     "ro" => "Romanian",
                     "roa-rup" => "Aromanian",
                     "ru" => "Russian",
                     "rw" => "Kinyarwanda",
                     "sa" => "Sanskrit",
                     "sc" => "Sardinian",
                     "sd" => "Sindhi",
                     "sg" => "Sangro",
                     "sh" => "Serbo-Croatian",
                     "si" => "Singhalese",
                     "sk" => "Slovak",
                     "sl" => "Slovene",
                     "sm" => "Samoan",
                     "sn" => "Shona",
                     "son" => "Songhay",
                     "sq" => "Albanian",
                     "sr" => "Serbian",
                     "ss" => "Siswati",
                     "st" => "Sesotho",
                     "su" => "Sundanese",
                     "sv" => "Swedish",
                     "sw" => "Swahili",
                     "ta" => "Tamil",
                     "te" => "Telugu",
                     "tg" => "Tajik",
                     "th" => "Thai",
                     "ti" => "Tigrinya",
                     "tk" => "Turkmen",
                     "tl" => "Tagalog",
                     "tlh" => "Klingon",
                     "tn" => "Setswana",
                     "to" => "Tonga",
                     "tokipona" => "Toki Pona",
                     "tpi" => "Tok Pisin",
                     "tr" => "Turkish",
                     "ts" => "Tsonga",
                     "tt" => "Tatar",
                     "tw" => "Twi",
                     "tzm" => "Tamazight",
                     "udm" => "Udmurt",
                     "ug" => "Uighur",
                     "uk" => "Ukrainian",
                     "ur" => "Urdu",
                     "uz" => "Uzbek",
                     "vi" => "Vietnamese",
                     "vo" => "Volapuk",
                     "wo" => "Wolof",
                     "xh" => "Xhosa",
                     "yi" => "Yiddish",
                     "yo" => "Yoruba",
                     "za" => "Zhuang",
                     "zh" => "Chinese",
                     "zh-cfr" => "Min-nan",
                     "zu" => "Zulu");
    }

    /**
     * Translates a language code into its English name.
     *
     * @param string $lang language code
     * @return string the name from getLangs(), or the code itself when unknown
     */
    public function decodeLang($lang) {
        $langs = self::getLangs();

        if (array_key_exists($lang,$langs))
            return $langs[$lang];
        else
            return $lang;
    }

    /**
     * Prints the search form: a text box for the word and a language
     * selector sorted by language name.
     *
     * Static because the page calls it as WikipediaArticle::form() without
     * an article instance.
     *
     * @param string $word    text to pre-fill (escaped before output)
     * @param string $askLang language code to pre-select
     */
    public static function form($word="",$askLang="en") {
        // $word is user input and lands inside a single-quoted attribute.
        $word = htmlspecialchars((string) $word,ENT_QUOTES,'UTF-8');

        echo "<form method='get' action=''>\n";
        echo "<input type='text' name='word' id='word' value='{$word}' />\n";
        echo "<select name='lang'>\n";
        $langs = self::getLangs();
        asort($langs);
        foreach ($langs as $code => $name) {
            $selected = ((string) $code === (string) $askLang) ? " selected" : "";
            echo "<option value=\"{$code}\"{$selected}>{$name}</option>\n";
        }
        echo "</select>\n";
        echo "<input type='submit' value='Translate' />\n";
        echo "</form>\n";
    }

    /**
     * Collects the targets of internal wiki links in the article text.
     *
     * Piped links ([[target|label]]) are gathered first, then plain links
     * ([[target]]). Links containing ":" (namespaces, interlanguage links)
     * never match. Purely numeric targets and "#" anchors are dropped.
     *
     * @return array unique link targets; keys are not renumbered
     */
    public function getRelatedTerms() {
        $patterns = array('/\[\[([^]:]+)\|([^]:]+)\]\]/',
                          '/\[\[([^]:|]+)\]\]/');
        $terms = array();
        foreach ($patterns as $pattern) {
            preg_match_all($pattern,(string) $this->text,$matches);
            foreach ($matches[1] as $term) {
                // rawurldecode() resolves %XX escapes but, unlike
                // urldecode(), leaves "+" alone ("C++" stays "C++").
                $term = rawurldecode(str_replace("_"," ",$term));
                if (!preg_match('/^[0-9]+$|^#/',$term))
                    $terms[] = $term;
            }
        }
        return array_unique($terms);
    }

    /**
     * Extracts interlanguage links ([[xx:Word]]) from the article text.
     *
     * @return array language code => array("word" => linked title,
     *               "lang" => language name); a later link for the same
     *               code overwrites an earlier one
     */
    public function getTranslations() {
        $trans = array();
        preg_match_all('/\[\[([a-z][a-z][a-z]?(?:-[^:]*)?):([^]]*)]]/',(string) $this->text,$matches);
        foreach ($matches[1] as $i => $code) {
            $trans[$code] = array("word" => rawurldecode($matches[2][$i]),
                                  "lang" => $this->decodeLang($code));
        }
        return $trans;
    }

    /**
     * Lists alternative meanings of the article's term.
     *
     * Candidates are the links on "<title> (disambiguation)" (a second HTTP
     * request) plus, when this article carries {{disambig}}, its own links.
     * Only candidates containing the requested word (case-insensitively)
     * are kept.
     *
     * @return array unique matching terms; keys are not renumbered
     */
    public function getDisambiguations() {
        $disambig = array();

        // Fall back to the requested name when no title was parsed (e.g. the
        // article does not exist); with neither there is nothing to look up.
        $base = ((string) $this->title !== "") ? $this->title : $this->requested;
        if ((string) $base !== "") {
            $u = new WikipediaArticle($base . " (disambiguation)",$this->lang);
            $disambig = $u->getRelatedTerms();
        }
        if (strpos((string) $this->text,"{{disambig}}") !== FALSE)
            $disambig = array_merge($this->getRelatedTerms(),$disambig);

        // Filter on the word this article was requested with rather than on
        // a request superglobal, so the class also works outside the page.
        $needle = ((string) $this->requested !== "") ? $this->requested : (string) $this->title;
        if ($needle !== "") {
            foreach ($disambig as $i => $tm)
                if (stripos($tm,$needle) === FALSE)
                    unset($disambig[$i]);
        }
        return array_unique($disambig);
    }
}
    
// Declare the response as UTF-8 HTML. This runs before the inline HTML that
// follows the closing tag, since header() has to be called before any output.
header("Content-Type: text/html; charset=UTF-8");
?>
<html>
<head>
<title>Translating and Disambiguating Dictionary</title>
<style>
body { background: #ffcc77; }
h1, h2 { color: #770077; }
</style>
</head>
<body>
<?
// Page controller: shows the bare search form when no word was submitted;
// otherwise fetches the article and prints its synonym (redirect target),
// translations and disambiguations. Every value taken from the request or
// from Wikipedia is escaped for the context it is printed in.

// "?source" dumps this script's own source instead of rendering the dictionary.
if (array_key_exists("source",$_REQUEST)) {
    show_source($_SERVER['SCRIPT_FILENAME']);
    exit;
}

// Language code: accept only letters, digits and inner hyphens, because the
// value is interpolated into a host name and echoed into the page.
$askLang = "en";
if (array_key_exists("lang",$_REQUEST) && is_string($_REQUEST["lang"]) &&
    preg_match('/\A[a-z0-9]+(?:-[a-z0-9]+)*\z/i',$_REQUEST["lang"])) {
    $askLang = $_REQUEST["lang"];
}

// Search term: a non-string value (e.g. word[]=x) is treated as absent.
$word = "";
if (array_key_exists("word",$_REQUEST) && is_string($_REQUEST["word"])) {
    $word = $_REQUEST["word"];
}

if ($word === "") {
    echo "<h1>Translating Dictionary</h1>\n";

    echo "<p>Written by Simon Kissane</p>\n";

    WikipediaArticle::form("",$askLang);
    echo "Based on <a href=\"http://www.wikipedia.org\">Wikipedia</a><br />\n";
}
else {
    $a = new WikipediaArticle($word,$askLang);
    $langParam = urlencode($askLang);

    echo "<h1>Translating Dictionary: " . htmlspecialchars((string) $a->title,ENT_QUOTES,'UTF-8') . "</h1>\n";

    echo "<p>Written by Simon Kissane</p>\n";

    WikipediaArticle::form($word,$askLang);

    // Find synonym: a redirect page names the article it stands for.
    // The keyword is matched case-insensitively.
    if (preg_match('/#REDIRECT\s*\[\[([^]]*)\]\]/i',(string) $a->text,$matches)) {
        $target = $matches[1];
        echo "<p><b>Synonym of:</b> <a href=\"?word=" . urlencode($target) . "&amp;lang={$langParam}\">"
           . htmlspecialchars($target,ENT_QUOTES,'UTF-8') . "</a></p>\n";
    }

    // Find translations
    $trans = $a->getTranslations();
    echo "<h2>Translations</h2>\n";
    if (count($trans) == 0)
        echo "<p>No translations available</p>\n";
    else {
        echo "<table>\n";
        echo "<tr><th>Word</th><th>Language</th></tr>\n";
        foreach ($trans as $code => $t) {
            $href = "?word=" . urlencode($t['word']) . "&amp;lang=" . urlencode($code);
            echo "<tr><td><a href=\"{$href}\">" . htmlspecialchars($t['word'],ENT_QUOTES,'UTF-8')
               . "</a></td><td>" . htmlspecialchars($t['lang'],ENT_QUOTES,'UTF-8') . "</td></tr>\n";
        }
        echo "</table>\n";
    }

    // Find disambiguations
    $disambig = $a->getDisambiguations();

    echo "<h2>Disambiguations</h2>\n";
    if (count($disambig) == 0)
        echo "<p>No disambiguations known.</p>\n";
    foreach ($disambig as $tm) {
        echo "<a href=\"?word=" . urlencode($tm) . "&amp;lang={$langParam}\">"
           . htmlspecialchars($tm,ENT_QUOTES,'UTF-8') . "</a><br />\n";
    }

    // The word becomes a path segment here, so use RFC 3986 encoding
    // (urlencode() would turn spaces into literal "+" characters).
    echo "<p>Based on the <a href='http://{$askLang}.wikipedia.org/wiki/" . rawurlencode($word) . "'>Wikipedia article</a>.</p>\n";
}


echo "<p><a href=\"?source\">View source</a></p>\n";
?>
</body>
</html>