User talk:Alecmconroy/Language study

From Meta, a Wikimedia project coordination wiki

This was the code used to generate the various files. It's not meant to be reused and is more an "interactive session log" than code. But if you want to know how I went from the CSV to the other files, this is how.


Code[edit]

<?php
//echo "hi <br/>\n";

// Import wm-language.csv into an associative matrix.
//
// Variables produced for the later sections of this script:
//   $csv_contents    - every parsed row, in file order
//   $row_headings    - the first cell of every row
//   $column_headings - the first row of the file (the column labels)
//   $langcodes       - the first cell of every data row
//   $resultsAssoc    - $resultsAssoc[rowLabel][columnLabel] = cell value
//                      (values stay strings exactly as read from the CSV)
$arrResult = array();
//$arrLines = file('wm-language.csv');

$file_handle = fopen("wm-language.csv", "r");
if ($file_handle === false) {
    // fopen() returns false on failure; looping on feof()/fgetcsv() with an
    // invalid handle either spins forever or dies with a TypeError, so stop
    // here with a message that names the actual problem.
    throw new RuntimeException("Unable to open wm-language.csv for reading");
}
$csv_contents= array();
$row_headings= array();
$column_headings= array();
$resultsAssoc=array();
$connectionMatrix=array();
$langcodes=array();

$counter=0;
// Loop on fgetcsv()'s own return value instead of feof(): feof() only turns
// true after a read has already hit the end, so a file ending in a newline
// used to push one extra "false" row through the loop body.
// Length 0 means "no line-length limit" (a 1024-byte cap can split a wide
// row into several pieces); delimiter, enclosure and escape are the
// historical defaults, spelled out explicitly.
while (($line_of_text = fgetcsv($file_handle, 0, ',', '"', '\\')) !== false) {
    $csv_contents[]=$line_of_text;
    $row_headings[]=$line_of_text[0];
    if ($counter==0) {$column_headings=$line_of_text;}

    // Only rows after the first five are treated as language rows.
    // NOTE(review): presumably rows 1-4 hold summary data rather than
    // languages - confirm against the CSV.
    // A blank line is returned by fgetcsv() as array(null); it carries no
    // language code, so it is not recorded as one.
    if ($counter>4 && $line_of_text[0] !== null)
    {
        $row_lang=$line_of_text[0];
        $langcodes[]=$row_lang;
        foreach ($line_of_text as $key=>$entry)
        {
            // Column 0 is the row label itself; cells beyond the header
            // width have no column label to be stored under.
            if ($key>0 && isset($column_headings[$key]))
            {
                $resultsAssoc[$row_lang][$column_headings[$key]]=$entry;
            }
        }
    }
    $counter++;
}
fclose($file_handle);
//print_r($resultsAssoc);

// save ***********************************************
/*$matrix_serialized = serialize($resultsAssoc);
file_put_contents('wm-language-matrix-serialized', $s);*/


// Sanity Check-- if everything went okay, this should look like Active Users ******************
/*
   echo "which individual languages can instantly reach most active users??<br/>";
   foreach ($langcodes as $langcode)
   {
       echo "$langcode , ". $resultsAssoc[$langcode][$langcode] ."<br/>";
   }
// It did.   
*/



// which language has most bilingual speakers
// technically incorrect, as a person who speaks five languages will get counted five times
// but this makes logical sense, as such a person is at least five times as valuable for communication
/*
echo "ACE active users: ".$resultsAssoc["ace"]["ace"] ."\n";

   echo "which individual languages has the most multilingual active users??<br/>";
    $total_multilingual_by_project= array();
   foreach ($langcodes as $langcode)
   {
       $total_multilingual_by_project[$langcode]=0;
       $sum=0;
       foreach ($langcodes as $langcodeb)
       {
            $sum=$sum+$resultsAssoc[$langcode][$langcodeb];
            if ($resultsAssoc[$langcode][$langcodeb]!=$resultsAssoc[$langcodeb][$langcode]) echo "error: $langcode $langcodeb\n";
       }
       $sum=$sum-$resultsAssoc[$langcode][$langcode];
       $total_multilingual_by_project[$langcode]=$sum;
       echo "$langcode , ".$sum."\n";
   }
*/
// Language codes used by the graph-building sections below, in the order the
// nodes are emitted.
$langlist = array(
    'en', 'de', 'fr', 'ru', 'ja', 'es', 'it', 'pl', 'zh', 'nl',
    'pt', 'ar', 'hi', 'sv', 'he', 'hu', 'fi', 'cs', 'uk', 'ko',
    'no', 'ca', 'tr', 'fa', 'da', 'bg', 'ro', 'id', 'th', 'vi',
    'sr', 'el', 'eo', 'sk', 'simple', 'hr', 'lt', 'et', 'sl', 'ka',
    'az', 'lv', 'eu', 'gl', 'ml', 'ta', 'la', 'mk', 'ms', 'be-x-old',
    'nn', 'be', 'cy', 'bs', 'oc', 'lb', 'ga', 'af', 'is', 'jv',
    'sh', 'hy', 'tl', 'sq', 'an', 'bn', 'br', 'mr', 'ast', 'sw',
    'tt', 'war', 'io', 'bar', 'lmo', 'qu', 'fy', 'kk', 'ku', 'te',
    'zh-min-nan', 'als', 'ceb', 'scn', 'os', 'pms', 'zh-yue', 'ia', 'ur', 'cv',
    'gd', 'kn', 'uz', 'vo', 'gv', 'hsb', 'ht', 'li', 'mn', 'nah',
    'bpy', 'nap', 'tg', 'yi', 'yo', 'bat-smg', 'fo', 'nds', 'su', 'arz',
    'hif', 'vec', 'am', 'gan', 'my', 'pam', 'sah', 'pnb', 'se', 'wa',
    'ksh', 'ne', 'sco', 'vls', 'gu', 'nds-nl', 'sa', 'ang', 'co', 'frp',
    'fur', 'ckb', 'mt', 'new', 'nov', 'si', 'wuu', 'bcl', 'diq', 'fiu-vro',
    'kw', 'mhr', 'nrm', 'szl', 'csb', 'ps', 'so', 'tk', 'ba', 'lad',
    'ln', 'zh-classical', 'cbk-zam', 'dsb', 'ie', 'lij', 'map-bms', 'mi', 'rm', 'stq',
    'arc', 'ay', 'bo', 'crh', 'gn', 'km', 'kv', 'mg', 'nv', 'sc',
    'bug', 'dv', 'eml', 'ext', 'ilo', 'kl', 'ky', 'pdc', 'udm', 'wo',
    'xal', 'jbo', 'krc', 'mdf', 'pap', 'pa', 'roa-rup', 'tet', 'ug', 'zea',
    'cu', 'hak', 'haw', 'koi', 'mwl', 'myv', 'or', 'tpi', 'ace', 'av',
    'ce', 'kab', 'mrj', 'mzn', 'pcd', 'pi', 'roa-tara', 'ab', 'bh', 'bjn',
    'ig', 'lo', 'na', 'pag', 'rw', 'sm', 'bm', 'ee', 'frr', 'ik',
    'iu', 'kg', 'lbe', 'pfl', 'pnt', 'srn', 'ss', 'ty', 'kaa', 'pih',
    'rmy', 'to', 'bi', 'cdo', 'chr', 'fj', 'gag', 'dz', 'ks', 'ltg',
    'ten', 'ts', 'cr', 'ff', 'glk', 'got', 'kbd', 'ny', 'tum', 've',
    'za', 'ak', 'as', 'bxr', 'ch', 'ha', 'lg', 'om', 'rn', 'sd',
    'sg', 'st', 'ti', 'tn', 'xh'
);
// unused **********************   
//print_r($total_multilingual_by_project);
//include_once("wm-language-libraries.php");
//$sep=',';
//WriteCsvFile("wm-langage-multilingualsbyproj.csv",$total_multilingual_by_project,$sep);

// which language has greatest "connection diversity"  (diminishing returns of additional bilinguals) >50 in language 
// For each language, what are their "conduit languages"-- languages that offer good chances at intercommunication with both them and en
// Which languages are more strongly connected to one of the other world languages than en?

//for each language, what's its connection to en?
/*
foreach ($langlist as $langlistitem)
{
    if ($langlistitem=='en') continue;
    //if ($langlistitem=='hi') break;
    echo "===$langlistitem===\n";
    $aTemp = $resultsAssoc[$langlistitem];
    asort($aTemp, SORT_NUMERIC);
    $TopFew = array_slice($aTemp, 0, 3, true);

    if ($resultsAssoc[$langlistitem]['en']>$TopFew[1]) {echo "'''";}
    echo "Direct: $langlistitem <-".$resultsAssoc[$langlistitem]['en']."-> en ";     
    if ($resultsAssoc[$langlistitem]['en']>$TopFew[1]) {echo "'''";}
    echo "<br/>\n";
    
    
    $thislangarray= array($resultsAssoc[$langlistitem]);
    asort($thislangarray[0]);
    $thislangarray=array_reverse($thislangarray[0],true);
    $counterb=0;
    foreach ($thislangarray as $indirectlang => $val)
    {  // print_r($thislangarray); echo "\n ^^ $indirectlang $val ^^"; exit();
        
        if ($indirectlang!=$langlistitem && $indirectlang!='en')
        {
            if ($resultsAssoc[$langlistitem][$indirectlang]>$resultsAssoc[$langlistitem]['en']){echo "'''";}
            echo "Indirect: $langlistitem <-".$resultsAssoc[$langlistitem][$indirectlang]."-> $indirectlang <-".$resultsAssoc[$indirectlang]['en']."-> en , multiplicative weight: ".$resultsAssoc[$indirectlang]['en']*$resultsAssoc[$langlistitem][$indirectlang];
            if ($resultsAssoc[$langlistitem][$indirectlang]>$resultsAssoc[$langlistitem]['en']){echo "'''";}
            echo " <br/>\n";
            if ($counterb>5) break;
            $counterb++;
        }
        
    }
}

*/


// make nodes.csv
/*
echo "id,Label,Weight\n";
foreach ($langlist as $key=>$langlistitem)
{
    echo ($key+1)." , $langlistitem,".$resultsAssoc[$langlistitem][$langlistitem]."\n";
}
*/
// make edge length
/*
echo "Source,Target,Weight,Type\n";
foreach ($langlist as $key=>$langlistitem)
{
    foreach ($langlist as $keyb=>$langlistitemb)
    {
    // each language pair
    
    
        if ($langlistitem!=$langlistitemb && $key<$keyb && $resultsAssoc[$langlistitem][$langlistitemb]>0)
        {
            echo ($key+1)." , ".($keyb+1)." , ".$resultsAssoc[$langlistitem][$langlistitemb].",Undirected\n";
        }
    }
}
*/

/*
// Make wm-lang-topfewedges.gdf

// make nodes

echo "nodedef>name VARCHAR,weight INT\n";
//echo "id,Label,Weight\n";
foreach ($langlist as $key=>$langlistitem)
{
    echo "$langlistitem,".$resultsAssoc[$langlistitem][$langlistitem]."\n";
}

// Make Edges
echo "edgedef>node1 VARCHAR,node2 VARCHAR,weight INT,directed BOOLEAN, topedge BOOLEAN\n";


foreach ($langlist as $key=>$langlistitem)
{
  
    $aTemp = $resultsAssoc[$langlistitem];
    asort($aTemp, SORT_NUMERIC);
    $aTemp = array_reverse($aTemp);
    $TopFew = array_slice($aTemp, 0, 3, true);
    
//    print_r($TopFew);   
    
    foreach ($TopFew as $keyb=>$langlistitemb)
    {
        // each lang pair
        
        
        if ($langlistitem!=$keyb && $resultsAssoc[$langlistitem][$keyb]>0) // no self-connections, no 0 weights.
        {
            
            echo "$langlistitem,$keyb,".$resultsAssoc[$langlistitem][$keyb].",False\n";
        }
    }
}

echo "nodedef>name VARCHAR,weight INT\n";
//echo "id,Label,Weight\n";
foreach ($langlist as $key=>$langlistitem)
{
    echo "$langlistitem,".$resultsAssoc[$langlistitem][$langlistitem]."\n";
}

// Make Edges
echo "edgedef>node1 VARCHAR,node2 VARCHAR,weight INT,directed BOOLEAN, topedge BOOLEAN\n";


foreach ($langlist as $key=>$langlistitem)
{
  
    $aTemp = $resultsAssoc[$langlistitem];
    asort($aTemp, SORT_NUMERIC);
    $aTemp = array_reverse($aTemp);
    $TopFew = array_slice($aTemp, 0, 3, true);
    
//    print_r($TopFew);   
    
    foreach ($TopFew as $keyb=>$langlistitemb)
    {
        // each lang pair
        
        
        if ($langlistitem!=$keyb && $resultsAssoc[$langlistitem][$keyb]>0) // no self-connections, no 0 weights.
        {
            
            echo "$langlistitem,$keyb,".$resultsAssoc[$langlistitem][$keyb].",False\n";
        }
    }
}
*/
/*
// Make wm-lang-top-one 
// todo later:  world languages, weight according translation priorities
// todo later:  parent language only, only world languages leaked to parent company.   


echo "nodedef>name VARCHAR,weight INT\n";
//echo "id,Label,Weight\n";
foreach ($langlist as $key=>$langlistitem)
{
    echo "$langlistitem,".$resultsAssoc[$langlistitem][$langlistitem]."\n";
}

// Make Edges
echo "edgedef>node1 VARCHAR,node2 VARCHAR,weight INT,directed BOOLEAN, topedge BOOLEAN\n";


foreach ($langlist as $key=>$langlistitem)
{
  
    $aTemp = $resultsAssoc[$langlistitem];
    asort($aTemp, SORT_NUMERIC);
    $aTemp = array_reverse($aTemp);
    $TopFew = array_slice($aTemp, 0, 3, true);
    
//    print_r($TopFew);   
    
    foreach ($TopFew as $keyb=>$langlistitemb)
    {
        // each lang pair
        
        
        if ($langlistitem!=$keyb && $resultsAssoc[$langlistitem][$keyb]>0) // no self-connections, no 0 weights.
        {
            
            echo "$langlistitem,$keyb,".$resultsAssoc[$langlistitem][$keyb].",true,True\n";
            break;
        }
    }
}

*/
// Make wm-lang-binarytree
// todo later:  world languages, weight according translation priorities
// todo later:  parent language only, only world languages leaked to parent company.   


// GDF node section: a header line, then one "name,weight" line per language.
// The weight is the diagonal cell of the matrix (the language's own entry
// against itself).
print "nodedef>name VARCHAR,weight INT\n";
//echo "id,Label,Weight\n";
foreach ($langlist as $key=>$langlistitem)
{
    printf("%s,%s\n", $langlistitem, $resultsAssoc[$langlistitem][$langlistitem]);
}

// GDF edge section header; the edge rows are printed further down.
print "edgedef>node1 VARCHAR,node2 VARCHAR,weight INT,directed BOOLEAN, topedge BOOLEAN\n";

/*
foreach ($langlist as $key=>$langlistitem)
{
  
    $aTemp = $resultsAssoc[$langlistitem];
    asort($aTemp, SORT_NUMERIC);
    $aTemp = array_reverse($aTemp);
    $TopFew = array_slice($aTemp, 0, 3, true);
    
    
//    print_r($TopFew);   
    
    foreach ($TopFew as $keyb=>$langlistitemb)
    {
        // each lang pair
        
        
        if ($langlistitem!=$keyb && $resultsAssoc[$langlistitem][$keyb]>0) // no self-connections, no 0 weights.
        {
            
            echo "$langlistitem,$keyb,".$resultsAssoc[$langlistitem][$keyb].",true,True\n";
            break;
        }
    }
}
*/

// Languages already placed in the tree, keyed by language code. It is shared
// with languagetree() through `global`, so it must exist before the call.
$takenlangs = array();

// Grow the tree outward from English as the single root node.
languagetree(array('en'));

/**
 * Print the edges of a language tree, one level at a time.
 *
 * Every language in $langlistitemArray is first marked as "taken" in the
 * global $takenlangs. Then, for each of them, its row of the global
 * $resultsAssoc matrix is walked from the largest value to the smallest and
 * up to two languages that are not yet taken and have a value greater than 0
 * are adopted as children. Each adoption echoes one line of the form
 * "parent,child,weight,true,True" and marks the child as taken. The children
 * collected from the whole level are then processed as the next level.
 *
 * Recursion stops as soon as a level is empty, i.e. when the previous level
 * found no new children.
 *
 * @param array $langlistitemArray language codes forming the current level
 * @return void
 */
function languagetree($langlistitemArray)
{
    global $resultsAssoc; global $takenlangs;

    // Base case: nothing on this level means the tree is complete. Without
    // this the function would keep calling itself with an empty array.
    if (empty($langlistitemArray)) {
        return;
    }

    $resultArray = array();

    // Reserve the whole level up front so siblings cannot adopt each other.
    foreach ($langlistitemArray as $langlistitem) {
        $takenlangs[$langlistitem] = "taken";
    }

    foreach ($langlistitemArray as $langlistitem) {
        // A language without a row in the matrix has no connections to follow.
        if (!isset($resultsAssoc[$langlistitem]) || !is_array($resultsAssoc[$langlistitem])) {
            continue;
        }

        // Connections of this language, strongest first. Keys are preserved
        // explicitly so each weight stays attached to its language code.
        $aTemp = $resultsAssoc[$langlistitem];
        asort($aTemp, SORT_NUMERIC);
        $topchildren = array_reverse($aTemp, true);

        $counter = 0;
        foreach ($topchildren as $keyb => $weight) {
            // isset() first: most languages have no entry yet, and reading a
            // missing key raises a warning.
            if (isset($takenlangs[$keyb]) && $takenlangs[$keyb] == "taken") {
                continue;
            }
            // No self-connections, no zero weights.
            if ($langlistitem != $keyb && $weight > 0) {
                echo "$langlistitem,$keyb,".$weight.",true,True\n";
                $resultArray[] = $keyb;
                $takenlangs[$keyb] = "taken";
                $counter++;
            }
            // At most two children per node.
            if ($counter == 2) {
                break;
            }
        }
    }

    languagetree($resultArray);
}