Translating Dictionary
From Meta, a Wikimedia project coordination wiki
This is a translating and disambiguating dictionary that uses raw Wikipedia data to find translations and disambiguations. I (Simon Kissane) wrote it.
<?php
/**
 * Translating and disambiguating dictionary.
 *
 * Fetches an article through Wikipedia's Special:Export page, then derives
 * translations from its interlanguage links and disambiguations from its
 * ordinary wiki links. The script part below the class renders a small HTML
 * front end driven by the request parameters "word", "lang" and "source".
 *
 * Written to run on PHP 5 as well as PHP 8 (no short open tags, a real
 * __construct, and static declarations for the methods that are called
 * statically).
 */
class WikipediaArticle
{
    /** Language code the article was requested in. */
    public $lang;

    /** Article title as reported by the export XML ("" when nothing was loaded). */
    public $title = "";

    /** Raw wikitext of the article ("" when nothing was loaded). */
    public $text = "";

    /** Name of the XML element currently being read ("title", "text") or null. */
    public $in = null;

    /**
     * Load the article $art from the $lang edition of Wikipedia.
     *
     * @param string $art  Article name.
     * @param string $lang Wikipedia language code.
     */
    public function __construct($art, $lang = "en")
    {
        $this->lang = $lang;
        $this->get_article($art, $lang);
    }

    /**
     * Base URL of Special:Export for the given language edition.
     *
     * @param string $lang Wikipedia language code.
     * @return string URL prefix; the article name is appended by the caller.
     */
    public function getURL($lang)
    {
        return "http://{$lang}.wikipedia.org/wiki/Special%3AExport/";
    }

    /**
     * Tell whether $lang is safe to use as a Wikipedia host-name label.
     *
     * Only lower-case letters, digits and single inner hyphens are accepted,
     * so a request parameter cannot redirect the fetch to another host.
     *
     * @param mixed $lang Candidate language code.
     * @return bool
     */
    public static function isValidLang($lang)
    {
        return is_string($lang)
            && preg_match('/^[a-z][a-z0-9]*(?:-[a-z0-9]+)*\z/', $lang) === 1;
    }

    /**
     * Escape a value for use in HTML text or in a quoted attribute.
     *
     * @param mixed $value Value to print.
     * @return string
     */
    public static function escapeHtml($value)
    {
        return htmlspecialchars((string) $value, ENT_QUOTES, 'UTF-8');
    }

    /**
     * Build an HTML link that looks up $word in language $lang with this script.
     *
     * @param string $word Word to look up (also used as the link label).
     * @param string $lang Language code for the lookup.
     * @return string Escaped <a> element.
     */
    public static function wordLink($word, $lang)
    {
        $href = '?word=' . urlencode($word) . '&lang=' . urlencode($lang);

        return '<a href="' . self::escapeHtml($href) . '">' . self::escapeHtml($word) . '</a>';
    }

    /**
     * Download the export XML of an article.
     *
     * @param string $art  Article name.
     * @param string $lang Wikipedia language code.
     * @return string|false XML document, or false when the language code is
     *                      rejected or the download fails.
     */
    public function get_article_xml($art, $lang)
    {
        if (!self::isValidLang($lang)) {
            return false;
        }

        // rawurlencode, not urlencode: in a URL path "+" is a literal plus
        // sign, so spaces must be sent as %20.
        return file_get_contents($this->getURL($lang) . rawurlencode((string) $art));
    }

    /** XML parser callback: remember when a title or text element opens. */
    public function xml_start($xp, $tag, $attrs)
    {
        if ($tag === "title" || $tag === "text") {
            $this->in = $tag;
        }
    }

    /** XML parser callback: forget the current element when it closes. */
    public function xml_end($xp, $tag)
    {
        if ($this->in === $tag) {
            $this->in = null;
        }
    }

    /** XML parser callback: append character data to the title or the text. */
    public function xml_cdata($xp, $data)
    {
        if ($this->in === "title") {
            $this->title .= $data;
        } elseif ($this->in === "text") {
            $this->text .= $data;
        }
    }

    /**
     * Fetch an article and fill $title and $text from its export XML.
     *
     * When nothing can be fetched, both properties are left as empty strings.
     *
     * @param string $art  Article name.
     * @param string $lang Wikipedia language code.
     */
    public function get_article($art, $lang = "en")
    {
        // Start clean so that a repeated call does not append to old data.
        $this->title = "";
        $this->text = "";
        $this->in = null;

        $xml = $this->get_article_xml($art, $lang);
        if ($xml === false || $xml === "") {
            return;
        }

        $xp = xml_parser_create();
        // Keep element names in their original case ("title", "text").
        xml_parser_set_option($xp, XML_OPTION_CASE_FOLDING, 0);
        xml_set_element_handler($xp, array($this, "xml_start"), array($this, "xml_end"));
        xml_set_character_data_handler($xp, array($this, "xml_cdata"));
        xml_parse($xp, $xml, true);
    }

    /**
     * Known language codes and their English names.
     *
     * @return array Map of language code => language name.
     */
    public static function getLangs()
    {
        return array(
            "aa" => "Afar", "ab" => "Abkhazian", "af" => "Afrikaans", "als" => "Alsatian",
            "am" => "Amharic", "an" => "Aragonese", "ar" => "Arabic", "as" => "Assamese",
            "ast" => "Asturian", "ay" => "Aymara", "az" => "Azeri", "ba" => "Bashkir",
            "bal" => "Baluchi", "ban" => "Balinese", "be" => "Belorussian",
            "ber" => "Berber (Tamazight)", "bg" => "Bulgarian", "bh" => "Bihari",
            "bi" => "Bislama", "bn" => "Bengali", "bo" => "Tibetan", "br" => "Breton",
            "bs" => "Bosnian", "bug" => "Buginese", "ca" => "Catalan", "ceb" => "Cebuano",
            "ch" => "Chamoru", "che" => "Chechen", "chm" => "Meadow Mari", "chr" => "Cherokee",
            "chv" => "Chuvash", "co" => "Corsican", "crh" => "Crimean Tatar", "cs" => "Czech",
            "csb" => "Kashubian", "cy" => "Welsh", "da" => "Danish", "de" => "German",
            "diu" => "Diudish", "div" => "Dhivehi", "dz" => "Dzongkha", "el" => "Greek",
            "en" => "English", "eo" => "Esperanto", "es" => "Spanish", "eso" => "Ekspreso",
            "et" => "Estonian", "eu" => "Basque", "fa" => "Persian", "fi" => "Finnish",
            "fiu" => "Karelian", "fj" => "Fijian", "fo" => "Faeroese", "fr" => "French",
            "fy" => "Frisian", "ga" => "Irish", "gay" => "Gayo", "gd" => "Scottish Gaelic",
            "gl" => "Galician", "gn" => "Guarani", "gs" => "Glosa", "gu" => "Gujarati",
            "gv" => "Manx Gaelic", "ha" => "Hausa", "he" => "Hebrew", "hi" => "Hindi",
            "hr" => "Croatian", "hu" => "Hungarian", "hy" => "Armenian", "ia" => "Interlingua",
            "iba" => "Iban", "id" => "Indonesian", "ie" => "Interlingue (ex occidental)",
            "ik" => "Inupiak", "is" => "Icelandic", "it" => "Italian", "iu" => "Inuktitut",
            "ja" => "Japanese", "jv" => "Javanese", "ka" => "Georgian", "kaw" => "Kawi",
            "kk" => "Kazakh", "kl" => "Greenlandic", "km" => "Khmer", "kn" => "Kannada",
            "ko" => "Korean", "ks" => "Kashmiri", "ku" => "Kurdish",
            "ky" => "Kirghiz (also Kyrgyz)", "la" => "Latin", "li" => "Limburgian",
            "ln" => "Lingala", "lo" => "Laotian", "ls" => "Latino Sine Flexione",
            "lt" => "Lithuanian", "lv" => "Latvian", "mad" => "Madurese", "mak" => "Makasar",
            "mg" => "Malagasy", "mi" => "Maori", "min" => "Minangkabau", "mk" => "Macedonian",
            "ml" => "Malayalam", "mn" => "Mongolian", "mo" => "Moldovan", "mr" => "Marathi",
            "ms" => "Malay", "mt" => "Maltese", "my" => "Burmese", "na" => "Nauri",
            "nah" => "Nahuatl", "nds" => "Low Saxon", "ne" => "Nepali", "ng" => "Ndongo",
            "nl" => "Dutch", "no" => "Norwegian", "oc" => "Occitan", "om" => "Oromo",
            "or" => "Oriya", "pa" => "Punjabi", "pl" => "Polish", "ps" => "Pashto",
            "pt" => "Portuguese", "qu" => "Quechua", "ra" => "Romanica",
            "rm" => "Rhaeto-Romance", "rn" => "Kirundi", "ro" => "Romanian",
            "roa-rup" => "Aromanian", "ru" => "Russian", "rw" => "Kinyarwanda",
            "sa" => "Sanskrit", "sc" => "Sardinian", "sd" => "Sindhi", "sg" => "Sangro",
            "sh" => "Serbo-Croatian", "si" => "Singhalese", "sk" => "Slovak", "sl" => "Slovene",
            "sm" => "Samoan", "sn" => "Shona", "son" => "Songhay", "sq" => "Albanian",
            "sr" => "Serbian", "ss" => "Siswati", "st" => "Sesotho", "su" => "Sundanese",
            "sv" => "Swedish", "sw" => "Swahili", "ta" => "Tamil", "te" => "Telugu",
            "tg" => "Tajik", "th" => "Thai", "ti" => "Tigrinya", "tk" => "Turkmen",
            "tl" => "Tagalog", "tlh" => "Klingon", "tn" => "Setswana", "to" => "Tonga",
            "tokipona" => "Toki Pona", "tpi" => "Tok Pisin", "tr" => "Turkish", "ts" => "Tsonga",
            "tt" => "Tatar", "tw" => "Twi", "tzm" => "Tamazight", "udm" => "Udmurt",
            "ug" => "Uighur", "uk" => "Ukrainian", "ur" => "Urdu", "uz" => "Uzbek",
            "vi" => "Vietnamese", "vo" => "Volapuk", "wo" => "Wolof", "xh" => "Xhosa",
            "yi" => "Yiddish", "yo" => "Yoruba", "za" => "Zhuang", "zh" => "Chinese",
            "zh-cfr" => "Min-nan", "zu" => "Zulu"
        );
    }

    /**
     * Translate a language code into its name.
     *
     * @param string $lang Language code.
     * @return string The language name, or $lang itself when the code is unknown.
     */
    public static function decodeLang($lang)
    {
        $langs = self::getLangs();

        return array_key_exists($lang, $langs) ? $langs[$lang] : $lang;
    }

    /**
     * Print the lookup form.
     *
     * @param string $word    Word to pre-fill in the text box.
     * @param string $askLang Language code to pre-select in the drop-down.
     */
    public static function form($word = "", $askLang = "en")
    {
        $langs = self::getLangs();
        asort($langs);

        echo "<form method='get' action=''>\n";
        // $word comes straight from the request, so it must be escaped.
        echo "<input type='text' name='word' id='word' value='" . self::escapeHtml($word) . "' />\n";
        echo "<select name='lang'>\n";
        foreach ($langs as $code => $name) {
            echo "<option value=\"" . self::escapeHtml($code) . "\"";
            if ((string) $code === (string) $askLang) {
                echo " selected ";
            }
            echo ">" . self::escapeHtml($name) . "</option>\n";
        }
        echo "</select>\n";
        echo "<input type='submit' value='Translate' />\n";
        echo "</form>\n";
    }

    /**
     * Normalise raw link targets and drop the ones that are not terms.
     *
     * Underscores become spaces and %XX escapes are decoded; purely numeric
     * targets (such as year links) and "#section" links are skipped.
     *
     * @param array $targets Raw link targets taken from the wikitext.
     * @return array Cleaned terms, in the original order.
     */
    private static function cleanTerms(array $targets)
    {
        $terms = array();
        foreach ($targets as $target) {
            // rawurldecode keeps "+" intact, so a link such as [[C++]] survives.
            $term = rawurldecode(str_replace("_", " ", $target));
            if (!preg_match('/^[0-9]+$|^#/', $term)) {
                $terms[] = $term;
            }
        }

        return $terms;
    }

    /**
     * Collect the targets of the article's ordinary wiki links.
     *
     * Piped links ([[target|label]]) are listed first, then plain links
     * ([[target]]). Links whose target contains a colon are ignored.
     *
     * @return array Unique terms (keys are not renumbered).
     */
    public function getRelatedTerms()
    {
        $text = (string) $this->text;

        preg_match_all('/\[\[([^]:]+)\|([^]:]+)\]\]/', $text, $piped);
        preg_match_all('/\[\[([^]:|]+)\]\]/', $text, $plain);

        return array_unique(array_merge(
            self::cleanTerms($piped[1]),
            self::cleanTerms($plain[1])
        ));
    }

    /**
     * Collect translations from the article's interlanguage links.
     *
     * @return array Map of language code => array("word" => ..., "lang" => ...),
     *               where "lang" is the language name when the code is known.
     */
    public function getTranslations()
    {
        $trans = array();

        // The optional "-suffix" part of the code must not contain brackets or
        // a pipe; otherwise a link such as [[co-operative]] would be glued to
        // the next link that contains a colon.
        preg_match_all(
            '/\[\[([a-z][a-z][a-z]?(?:-[^\]\[:|]*)?):([^]]*)]]/',
            (string) $this->text,
            $matches
        );

        foreach ($matches[1] as $i => $code) {
            $trans[$code] = array(
                "word" => urldecode($matches[2][$i]),
                "lang" => self::decodeLang($code),
            );
        }

        return $trans;
    }

    /**
     * Collect disambiguation candidates for this article.
     *
     * Links from the "<title> (disambiguation)" article are used, plus this
     * article's own links when its text contains {{disambig}}. Only terms that
     * contain the requested word (case-insensitively) are kept; when the
     * request carries no usable "word", the article title is used instead.
     *
     * @return array Unique terms (keys are not renumbered).
     */
    public function getDisambiguations()
    {
        $page = new WikipediaArticle($this->title . " (disambiguation)", $this->lang);
        $disambig = $page->getRelatedTerms();

        if (strpos((string) $this->text, "{{disambig}}") !== false) {
            $disambig = array_merge($this->getRelatedTerms(), $disambig);
        }

        $needle = $this->title;
        if (isset($_REQUEST["word"]) && is_string($_REQUEST["word"]) && $_REQUEST["word"] !== "") {
            $needle = $_REQUEST["word"];
        }

        // An empty needle cannot be searched for; leave the list unfiltered then.
        if ($needle !== "") {
            foreach ($disambig as $i => $tm) {
                if (stripos($tm, $needle) === false) {
                    unset($disambig[$i]);
                }
            }
        }

        return array_unique($disambig);
    }
}

// header() only works while nothing has been sent to the client yet.
if (!headers_sent()) {
    header("Content-Type: text/html; charset=UTF-8");
}
?>
<html>
<head>
<title>Translating and Disambiguating Dictionary</title>
<style>
body { background: #ffcc77; }
h1, h2 { color: #770077; }
</style>
</head>
<body>
<?php
// "?source" shows this script's own source instead of the dictionary.
if (array_key_exists("source", $_REQUEST)) {
    show_source($_SERVER['SCRIPT_FILENAME']);
    exit;
}

// The language ends up in a host name, so anything unexpected falls back to "en".
$askLang = "en";
if (isset($_REQUEST["lang"]) && WikipediaArticle::isValidLang($_REQUEST["lang"])) {
    $askLang = $_REQUEST["lang"];
}

// Ignore non-string input such as word[]=x.
$word = (isset($_REQUEST["word"]) && is_string($_REQUEST["word"])) ? $_REQUEST["word"] : "";

if ($word === "") {
    echo "<h1>Translating Dictionary</h1>\n";
    echo "<p>Written by Simon Kissane</p>\n";
    WikipediaArticle::form("", $askLang);
    echo "Based on <a href=\"http://www.wikipedia.org\">Wikipedia</a><br />\n";
} else {
    $a = new WikipediaArticle($word, $askLang);

    echo "<h1>Translating Dictionary: " . WikipediaArticle::escapeHtml($a->title) . "</h1>\n";
    echo "<p>Written by Simon Kissane</p>\n";
    WikipediaArticle::form($word, $askLang);

    // Find synonym (MediaWiki accepts the redirect keyword in any letter case)
    if (preg_match('/#REDIRECT\s*\[\[([^]]*)\]\]/i', $a->text, $matches)) {
        echo "<p><b>Synonym of:</b> " . WikipediaArticle::wordLink($matches[1], $askLang) . "</p>\n";
    }

    // Find translations
    $trans = $a->getTranslations();
    echo "<h2>Translations</h2>\n";
    if (count($trans) == 0) {
        echo "<p>No translations available</p>\n";
    } else {
        echo "<table>\n";
        echo "<tr><th>Word</th><th>Language</th></tr>\n";
        foreach ($trans as $code => $t) {
            echo "<tr><td>" . WikipediaArticle::wordLink($t['word'], $code) . "</td><td>"
                . WikipediaArticle::escapeHtml($t['lang']) . "</td></tr>\n";
        }
        echo "</table>\n";
    }

    // Find disambiguations
    $disambig = $a->getDisambiguations();
    echo "<h2>Disambiguations</h2>\n";
    if (count($disambig) == 0) {
        echo "<p>No disambiguations known.</p>\n";
    }
    foreach ($disambig as $tm) {
        echo WikipediaArticle::wordLink($tm, $askLang) . "<br />\n";
    }

    // $askLang has been validated above; the word goes into the URL path.
    echo "<p>Based on the <a href='http://{$askLang}.wikipedia.org/wiki/" . rawurlencode($word)
        . "'>Wikipedia article</a>.</p>\n";
}

echo "<p><a href=\"?source\">View source</a></p>\n";
?>
</body>
</html>