// Wikificator/html2wiki.js
//
// From Meta, a Wikimedia project coordination wiki

/*

 Wikificator - an HTML to Mediawiki converter and Typography processor in JavaScript
 Version 0.3.1
 Last update: --17:42, 25 August 2006 (UTC)
 by Shtriter Andrew, http://meta.wikimedia.org/wiki/User:Shtriter


The Wikificator's main part. It includes:

  • Selection handling
  • Html2Wiki transformation function
  • English messages ( can be replaced in local projects )
  • ProcessTypography function prototype ( should be replaced in local projects )

To include it in your project, write the following in your project's js file:

mw.loader.load( '//meta.wikimedia.org/w/index.php?title=Wikificator/Wikificator.js&action=raw&ctype=text/javascript' );

Changelog:

2do list:

  • Add HTML 2 inline wiki transformations
  • Changelog filling
  • UnEscapeInPre - replace with doHTML Entities.
  • Clean the code like FCK does
  • Extend the Html2Wiki - function including:
    • < html >
    • < nowiki >
    • < math >
    • < pre >
    • < gallery >
    • < comment >
    • [[Image:]] [[Media:]]
    • {{ }} and {{{ }}}
      • < noinclude > & < includeonly >
      • MagicWords
    • < User Tags >


The file's content itself:

*/ 

// Messages shown to the user. These are the English defaults; every name is
// declared with ''var'' so that the file also loads in strict-mode code, where
// assigning to an undeclared identifier throws a ReferenceError.
var wmFullText = 'Wikificator will process entire article\'s text. Do you want to proceed?';
var wmCantWork = 'Wikificator cannot work in your browser';
var wmWontWork = 'Wikificator will not work in Netscape 4.x and less';

// Local names of the namespaces handled by ProcessNS()
var wmCategoryNS = 'Category';
var wmTemplateNS = 'Template';
var wmUserNS = 'User';
var wmImageNS = 'Image';
var wmMediaNS = 'Media';

// Path prefix that marks an internal link in the rendered HTML. Alternatives:
var wmIntLinkPat = '/wiki/';
//'/wiki/'
//'/index.php/'
//'/w?title='
//'/index.php?title='

// Parallel lists: wmEnNS[k] is the English name of the namespace whose local
// name is wmLocaleNS[k]
var wmLocaleNS = [ wmCategoryNS, wmTemplateNS, wmUserNS, wmImageNS, wmMediaNS ];
var wmEnNS = [ 'category', 'template', 'user', 'image', 'media' ];

//======================================
// Entry point: wikifies the current selection of the edit box
// (document.editform.wpTextbox1). When nothing is selected the whole text is
// processed via all_text(); browsers without a usable selection API are asked
// for confirmation first.
//
// A single space is put in front of the fragment before Process() is called and
// removed again afterwards, the same way all_text() does it.
function Wikify()
{
check_regexp(); // Check whether regular expressions are supported
var txtarea = document.editform.wpTextbox1;
var txt; // local on purpose: an undeclared assignment would create a global
txtarea.focus();

if (document.selection && !is_gecko) /* IE */
	{
	txt = " " + document.selection.createRange().text;
	if (txt == " ") { all_text(); } // If nothing was selected
	else
		{
		txt = Process(txt);
		txt = txt.substr(1);
		document.selection.createRange().text = txt;
		}
	}
// Browsers exposing selectionStart; Gecko builds dated after 10.2003 only
else if ((txtarea.selectionStart || txtarea.selectionStart === 0) && (navigator.productSub > 20031000))
	{
	var startPos = txtarea.selectionStart;
	var endPos = txtarea.selectionEnd;
	var scrollTop = txtarea.scrollTop;
	txt = " " + (txtarea.value).substring(startPos, endPos);
	if (txt == " ") { all_text(); } // If nothing was selected
	else
		{
		txt = Process(txt);
		txt = txt.substr(1);
		txtarea.value = txtarea.value.substring(0, startPos) + txt + txtarea.value.substring(endPos, txtarea.value.length);
		txtarea.focus();
		// Put the view back where it was: the scroll offset was saved above
		// but never written back after the value had been replaced
		txtarea.scrollTop = scrollTop;
		}
	}
else { if (confirm(wmFullText)) { all_text(); } } // Other browsers
}
//======================================
// Processes the whole content of the edit box (document.editform.wpTextbox1).
// A single space is put in front of the text before Process() is called and
// the first character of the result is dropped again before it is written back.
function all_text()// Process all text
{
var box = document.editform.wpTextbox1;
// local variable: the undeclared ''txt'' used before leaked into the global scope
var txt = Process( " " + box.value );
box.value = txt.substr(1);
}
//======================================
// Checks that the browser can run Wikificator: regular expressions must work
// and the browser must not be Netscape 4.x or older.
// On failure the matching message is shown with alert() and an Error is thrown
// so that the caller stops. (The code used to stop by evaluating the undefined
// identifier ''exit'', which only worked because that raises a ReferenceError.)
function check_regexp()// Check whether regular expressions are supported
{
var probe = "code".replace(/d/g, "r");
if (probe != "core")
	{
	alert(wmCantWork);
	throw new Error(wmCantWork);
	}

// Major version number; parseInt reads all leading digits, not only the first
// character, so a version such as "10.0" is not mistaken for "1"
var b_ver = parseInt(navigator.appVersion, 10);
if (navigator.appName == "Netscape" && b_ver < 5)
	{
	alert(wmWontWork);
	throw new Error(wmWontWork);
	}
return ;
}
// Converts one piece of text and returns the result.
//
// Steps:
//   1. Html2Wiki() turns HTML into wikitext.
//   2. Fragments that typography must not touch (nowiki, pre, math, gallery,
//      templates, link targets, quoted attributes, remaining tags, lines that
//      start with a space) are swapped for numbered placeholders built from
//      pairs of control characters.
//   3. ProcessTypography() runs on what is left.
//   4. The placeholders are swapped back, <pre> content is unescaped and
//      trailing blanks are trimmed.
//
// t is expected to start with one extra character (callers pass a space): it is
// taken off before the "lines that start with space" pattern runs and put back
// afterwards.
function Process( t )
{
 // Replaces every match of ''pattern'' in ''text'' with pair[0] + N + pair[1],
 // N being the 1-based position of the match, and stores the match in ''saved''.
 function protect( text, pattern, pair, saved )
 {
 return text.replace( new RegExp( pattern, 'gim' ), function ( found )
	{
	saved.push( found );
	return pair.charAt(0) + saved.length + pair.charAt(1);
	} );
 }

 // Puts the stored fragments back in place of the placeholders of ''pair''.
 // A function is used as replacement so that "$$", "$&", "$1" ... inside the
 // stored text are inserted literally instead of being read as replacement
 // patterns. Placeholders without a stored fragment are left untouched.
 function restore( text, pair, saved )
 {
 return text.replace( new RegExp( pat4chars( pair ), 'gm' ), function ( marker, number )
	{
	var original = saved[ number - 1 ];
	return ( original === undefined ) ? marker : original;
	} );
 }

// The editor's hidden line-break markers stand for real line breaks
t = t.replace(/<br style="display: none;" \/>/gi, '\n');

//RegExp patterns for:
var patterns = [
		  pat4tags('nowiki'),		// < nowiki >
		  pat4tags('pre'),		// < pre >
		  pat4tags('math'),		// < math >
		  pat4tags('gallery'),		// < gallery >
		  "\\{\\{(.|\\r|\\n)+?\\}\\}",		// templates {{ }}
		  "(\\[\\[)(.*?)(\\||\\]\\])",		// links [[ ]] (or [[ | )
		  '(=)(\\s?)(\\")(.*?)(\\")',		// attributes in quote ( =" )
		  "<([^>]*?)>",			// other tags
		  "^( )(.+)$"			// lines that start with space
		];

// One pair of "safe" chars per pattern.
// We have 3 more pairs of safe chars in \x1A \x1F !
var save_pair = [
		   "\x01\x02",
		   "\x03\x04",
		   "\x05\x06",
		   "\x0E\x0F",
		   "\x10\x11",
		   "\x12\x13",
		   "\x14\x15",
		   "\x16\x17",
		   "\x18\x19"
		];

// matches[i] holds the fragments that were replaced for patterns[i]
var matches = [];

// The leading character taken off before the last pattern runs
var f_space = '';

var i;

// Convert html representation to wikitext
t = Html2Wiki ( t );

//======================================
// Replace all occurrences of the patterns with placeholders
//======================================

for ( i = 0; i < patterns.length; i++ )
	{
	if ( i == 3 )
		{
		// Process default namespaces and ranges.
		// NOTE(review): index 3 is the < gallery > pattern, so this runs
		// while templates and links are still unprotected; the old comment
		// said "before wikilinks replacement" - confirm the intended index.
		t = ProcessNS( t, wmEnNS , wmLocaleNS );
		t = CorrectRanges( t );
		}

	if ( i == patterns.length - 1 )
		{
		// Take the leading character off so that the first line is not
		// treated as a line that starts with a space
		f_space = t.substr( 0, 1 );
		t = t.substr( 1 );
		}

	matches[i] = [];
	t = protect( t, patterns[i], save_pair[i], matches[i] );
	}

// Restore the first character that was taken off above
t = f_space + t;

// Do the Typography stuff
t = ProcessTypography( t );

//======================================
// Put the protected fragments back, last pattern first, because a fragment may
// itself contain placeholders of the patterns that ran before it
//======================================

for ( i = patterns.length - 1; i > -1; i-- )
	{
	t = restore( t, save_pair[i], matches[i] );
	}

// Unescape text between < pre > tags
t = t.replace( /(<pre>)((?:.|\s)+?)(<\/pre>)/gim, UnEscapeInPre);

// Remove blanks from the end of lines. The expression used here before had "&"
// where "$" was meant and deleted the last ampersand of every line instead.
// Only blanks that follow visible text and precede a line break are removed, so
// whitespace-only lines and the end of a partial selection stay as they are.
t = t.replace( /(\S)[ \t]+(?=\r?\n)/g, '$1' );

return t;

}

// Converts the HTML that MediaWiki renders (images, categories, internal and
// external links, headers, paragraphs, bold/italic, rules, line breaks, tables
// and lists) back into wikitext and returns the converted text.
// Reads the globals wmIntLinkPat and wmCategoryNS.
function Html2Wiki( t )
{

 // Escapes regular-expression metacharacters so that configuration values
 // such as '/index.php?title=' are matched literally
 function escapeRe( s )
 {
 return String( s ).replace( /[\\^$.*+?()[\]{}|\/]/g, '\\$&' );
 }

 // Replacement callback for external links: $1 is the URL, $3 the link text.
 // A link whose text is its own URL becomes the bare URL.
 function doExtLinks($0, $1, $2, $3)
 {
 if ($1 !== $3) return '[' + $1 + ' ' + $3 + ']';
 return $1;
 }

// Common start of every internal link: <a href="<prefix>..." title="
var intLink = '<a href="' + escapeRe( wmIntLinkPat ) + '(?:.+?)" title="';

t = ProcessImages( t );

// Process the categories
t = t.replace( new RegExp( '<span dir="ltr" style="display:none">' + intLink + '(Category|' + escapeRe( wmCategoryNS ) + '):(.+?)">\\2<\\/a><\\/span>', 'gim' ), '[[$1:$2]]' );
// Regex literal: inside a string "\s" collapses to "s", so the box was never
// matched once it spanned more than one line
t = t.replace( /<div id="catlinks"><p class="catlinks">(?:.|\s)+?<\/p><\/div>/gim, '' );


// Make internal links ( [[...]] ) from html
t = t.replace( new RegExp( intLink + '(.+?)">\\1<\\/a>', 'gim' ), '[[$1]]' );
t = t.replace( new RegExp( intLink + '(.+?)">\\1([a-zа-яё]*)<\\/a>', 'gim' ), '[[$1]]$2' );
t = t.replace( new RegExp( intLink + '(.+?)">(.+?)<\\/a>', 'gim' ), '[[$1|$2]]' );


// Make external links ( [...] ) from html
// Numbered links ( [1], [2] ... ): as a string this pattern lost its
// backslashes and became the character class [d+]
t = t.replace( /<a href="(.+?)"(?:.*?)>\[\d+\]<\/a>/gim, '[$1]' );
t = t.replace( /<a href="(.+?)"(.*?)>(.+?)<\/a>/gim, doExtLinks );


// Replace html headers <H?> with equal signs ={?}
t = t.replace( /^<h([1-6])>(.+)<\/h\1>/gim, pat4heads );

// Process linebreaks ([^>]* keeps the match inside the <br> tag; ".*" ran on
// to the last ">" of the line and swallowed the text in between)
t = t.replace(/<p>(\s)*<br[^>]*>/gim, '\n');

// Replace paragraphs
t = t.replace(/^<\/p>\n/gim, '');
t = t.replace(/<\/?p\s*>/gim, '');

// Replace <b>, <strong> tags with ''' and <i>, <em> with ''
t = t.replace(/\<\/?(b|strong)\>/gim, "\'\'\'");
t = t.replace(/\<\/?(i|em)\>/gim, "\'\'");

// Replace <hr> tag with ----, improve <hr> and <br> tags
t = t.replace(/\<hr ?\/?\>/gi, "----");
t = t.replace(/\<hr ([^\>\/]+?) ?\/?\>/gi, "<hr $1 />");

t = t.replace(/\<br ?\/?\>/gi, "<br/>");
t = t.replace(/\<br ([^\>\/]+?) ?\/?\>/gi, "<br $1 />");


t = ProcessTables( t );
t = ProcessLists( t );

return t;
}
//**********************************************************

// Converts HTML table markup into wiki table markup and returns the text:
//   <table ...> -> {| ...       <caption> -> |+        </table> -> |}
//   <tr ...>    -> |- ...       <th ...>  -> ! ... |   <td ...> -> | ... |
// <thead>, <tbody>, <tfoot> and the closing row and cell tags are dropped.
//
// The attribute forms require whitespace after the tag name, so <thead> is not
// read as "<th" followed by the attributes "ead"; attribute text may contain
// "/" (as in url(/a.png)).
function ProcessTables( t )
{
// Table, caption and section wrappers
t = t.replace(/ *<table\s*>/gi, "{|");
t = t.replace(/ *<table\s+([^>]+?)\s*>/gi, "{| $1");

t = t.replace(/ *\<caption ?\>(.*)\<\/caption ?\>/gim, "|+ $1");

t = t.replace(/ *<\/?t(?:body|head|foot)(?:\s[^>]*)?>\n?/gi, "");

t = t.replace(/ *\<\/table ?\>/gim, "|}");

// Rows
t = t.replace(/ *<tr\s*>/gi, "|-");
t = t.replace(/ *<tr\s+([^>]+?)\s*>/gi, "|- $1");

// The line break is optional: </tr> directly followed by other text used to
// be left in the output
t = t.replace(/ *<\/tr ?>\n?/gi, "");

// Header cells
t = t.replace(/ *<th\s*>/gi, "!");
t = t.replace(/ *<th\s+([^>]+?)\s*>/gi, "! $1 |");

t = t.replace(/ *\<\/th ?\>/gim, "");

// Data cells
t = t.replace(/ *<td\s*>/gi, "|");
t = t.replace(/ *<td\s+([^>]+?)\s*>/gi, "| $1 |");

t = t.replace(/ *\<\/td ?\>/gim, "");

return t;
}

// Converts HTML lists (<ul>, <ol>, <dl>) into wiki list markup ( * # ; : ).
//
// The text is handled line by line, so an opening, continuing or closing tag
// pair is only recognised when both tags are on the same line, for example
//     <ul><li>one
//     </li><li>two
//     </li></ul>
// ''prefix'' holds the markers of the currently open lists, ''pref2'' the
// colons used for <dd> items. Lines that consist only of closing tags are
// marked with \x1A and removed at the end.
function ProcessLists( t )
{
var lines = t.split('\n');

var marks = ['*', '#', ';', ':'];
var opening = [/<ul>\s*<li>/gi, /<ol>\s*<li>/gi, /<dl>\s*<dt>/gi, /<dl>\s*<dd>/gi];
var items = [/<\/li>\s*<li>/gi, /<\/dt>\s*<dt>/gi, /<\/dd>\s*<dt>/gi];
var closing = [/<\/li>\s*<\/ul>/gi, /<\/li>\s*<\/ol>/gi, /<\/dt>\s*<\/dl>/gi, /<\/dd>\s*<\/dl>/gi];

var colons = [/<\/dt>\s*<dd>/gi, /<\/dd>\s*<dd>/gi];

var prefix = '';
var pref2 = '';

var n, i, line;

// search() is used for the checks because, unlike test(), it does not depend
// on the lastIndex state of these global expressions
for (n = 0; n < lines.length; n++)
  {
  line = lines[n];

  for (i = 0; i < marks.length; i++)
    {

    //open lists

    if (line.search(opening[i]) != -1)
      {
      //add the symbol of the list when starting new one
      prefix += marks[i];

      line = line.replace(opening[i], prefix);
      }


    //continue lists

    if ( i < 3 ) // items has three entries only
      {
      // next list item found

      if (line.search(items[i]) != -1)
        {
        //shorten the 2nd prefix if </dd><dt> is found
        if ( i == 2 ) pref2 = pref2.substr(0, pref2.length-1);

        line = line.replace(items[i], prefix);
        }

      //if dl continues and has colons...

      if ( i != 0 && line.search(colons[i-1]) != -1 )
        {
        pref2 += ':';
        line = line.replace(colons[i-1], pref2);
        }

      }


    //close lists

    if (line.search(closing[i]) != -1)
      {
      prefix = prefix.substr(0, prefix.length-1);

      //mark the line so that it can be deleted below
      line = line.replace(closing[i], '\x1A');

      //shorten the 2nd prefix if </dd></dl> is found
      if ( i == 3 ) pref2 = pref2.substr(0, pref2.length-1);
      }

    }

  lines[n] = line;
  }

t = lines.join('\n');

// Delete the lines that held nothing but closing tags
t = t.replace(/^\x1A\n?/gm, '');

// Remove blanks from the end of lines. The expression used here before had "&"
// where "$" was meant and deleted the last ampersand of the first line instead.
// Only blanks that follow visible text and precede a line break are removed.
t = t.replace(/(\S)[ \t]+(?=\r?\n)/g, '$1');

return t;
}

// Replaces the HTML that MediaWiki renders for an image with the wiki image
// link; every match is handed to MakeImage() and replaced by its result.
//
// Example of rendered HTML and the wikitext it comes from:
//   [[Image:Test.PNG|250px|frame|center|Caption]]
//   =
//   <div class="center"><div class="thumb tnone"><div style="width: 810px;"><a href="/index.php/Image:Test.PNG" class="internal" title="Caption"><img src="/images/9/9f/Test.PNG" alt="Caption" longdesc="/index.php/Image:Test.PNG" height="581" width="808"></a><div class="thumbcaption">Caption</div></div></div></div>
//
// Capture groups passed to MakeImage():
//   $1 - center|undef
//   $2 - thumb t|float|undef
//   $3 - right|left|none|undef
//   $4 - image name
//   $5 - image caption (alt attribute, if set)
//   $6 - image width (if set)
// NOTE(review): $5 and $6 sit inside a repeated group; a regular expression
// keeps only the captures of the last repetition, so an <img> carrying both
// alt and width probably yields just one of them - verify with real markup.
//
// The debugging alert that showed the matches on every call has been removed.
function ProcessImages( t )
{
var img= RegExp(

// optional wrapper used for centred images
"(?:<div class=['\"](center)['\"]>[ \n\r]*)?" + //$1 - center|undef

// optional wrapper used for framed or floating images
"(?:<div class=['\"]" +

// frames use "thumb t" + align, plain alignment uses "float" + align
"(?:(thumb t|float)(right|left|none))?['\"]>[ \n\r]*" +

	// inner div (frame) or span (alignment only)
	"(?:<div.*?>|<span>)?[ \n\r]*" +
")?" +

		// always present: the link around the image and the image itself
		"[ \n\r]*<a href=['\"].+?:(.+?)['\"].*?>[ \n\r]*<img .*?src=['\"].+?['\"]"  +

		"(?: alt=['\"](.+?)['\"]|" +
		//" longdesc=['\"].+?:(.+?)['\"]|" +
		" width=['\"](.+?)['\"]|" +
		".+?)*?" +

		">[ \n\r]*</a>" +

		// only for frames: caption block
		"(?:[ \n\r]*<div.*?>" + 
			// only for frame && thumb
			"(?:[ \n\r]*<div(?:.|\n|\r)*?</div>)?" +
		"(?:.|\n|\r)*?</div>)?" +

	// closing tag of the inner div or span
	"[ \n\r]*(?:</div>|</span>)?" +
"(?:[ \n\r]*</div>)?" + //thumb or float
".*?" +
"(?:[ \n\r]*</div>)?" // center

,
'gim');

t = t.replace( img , MakeImage);

return t;
}

// $1 - center|undef
// $2 - thumb t|float|undef
// $3 - right|left|none|undef
// $4 - ImageName
// $5 - Img caption (if was set)
// $6 - Img width (if was set)


// Replacement callback used by ProcessImages(): builds the wiki image link
// from the captured parts.
//   $0 - the whole match (not used)
//   $1 - center|undef
//   $2 - thumb t|float|undef ('thumb t' adds the option "thumbnail")
//   $3 - right|left|none|undef
//   $4 - image name
//   $5 - caption (if set)
//   $6 - width in pixels (if set)
// Returns [[<wmImageNS>:<name>|option|...]]
function MakeImage( $0, $1, $2, $3, $4, $5, $6)
{
// Collect the parts in their final order and join them with "|" at the end
var parts = [ wmImageNS + ':' + $4 ];

if ($1) { parts.push( $1 ); }
if ($2 == 'thumb t') { parts.push( 'thumbnail' ); }
if ($3) { parts.push( $3 ); }
if ($6) { parts.push( $6 + 'px' ); }

// the caption must come last
if ($5) { parts.push( $5 ); }

return '[[' + parts.join( '|' ) + ']]';
}
//***********************************************************
// Process default namespaces

// Rewrites the namespace prefix of links: for every k, "[[<En_NS_List[k]>:" and
// "[[<Loc_NS_List[k]>:" (in any letter case, with or without a leading colon
// as in "[[:Category:") become "[[<Loc_NS_List[k]>:".
//   t           - text to process
//   En_NS_List  - English namespace names
//   Loc_NS_List - local namespace names, parallel to En_NS_List
// Returns the processed text.
//
// The expression stops right after the colon. It used to end in "(.*)", which
// consumed the rest of the line, so only the first link of each line was
// rewritten.
function ProcessNS( t, En_NS_List , Loc_NS_List )
{
 // Escapes regular-expression metacharacters in a namespace name
 function escapeRe( s )
 {
 return String( s ).replace( /[\\^$.*+?()[\]{}|\/]/g, '\\$&' );
 }

 // Rewrites the prefixes of one namespace
 function localize( text, enName, locName )
 {
 var re = new RegExp( "(\\[\\[:?)(?:" + escapeRe( enName ) + "|" + escapeRe( locName ) + "):", "gi" );

 // a function keeps "$" in a namespace name from being read as a pattern
 return text.replace( re, function ( $0, $1 ) { return $1 + locName + ":"; } );
 }

for (var i = 0; i < En_NS_List.length; i++)
  {
  t = localize( t, En_NS_List[i], Loc_NS_List[i] );
  }
return t;
}

// Replacement callback for <pre> blocks: $1 is the opening tag, $2 the content
// and $3 the closing tag. Returns the block with &lt; &gt; and &amp; in the
// content turned back into < > and &.
function UnEscapeInPre( $0, $1, $2, $3 )
{
// &amp; is handled last so that "&amp;lt;" ends up as "&lt;" and not as "<"
var entities = [ ['&lt;', '<'], ['&gt;', '>'], ['&amp;', '&'] ];
var content = $2;

for (var k = 0; k < entities.length; k++)
	{
	content = content.split( entities[k][0] ).join( entities[k][1] );
	}

return $1 + content + $3;
}

//***********************************************************
// RegExp pattern for given tag
// Returns the source of a regular expression that matches <tag_name>, the
// shortest possible content (at least one character, line breaks included)
// and </tag_name>.
function pat4tags( tag_name )
{
var openTag = "\\<".concat( tag_name, "\\>" );
var content = "(.|\r|\n)+?";
var closeTag = "\\<\\/".concat( tag_name, "\\>" );

return openTag + content + closeTag;
}

//***********************************************************
// RegExp pattern for "save" pair of chars
// Returns the source of a regular expression that matches a placeholder:
// save_chars[0], an optional run of digits (captured) and save_chars[1].
// Both chars are preceded by a backslash in the returned pattern.
function pat4chars( save_chars )
{
var opener = "\\" + save_chars[0];
var closer = "\\" + save_chars[1];

return opener + "([0-9]*)" + closer;
}

//***********************************************************
// Pattern for the string that replaces html headers with equal signs
// Replacement callback for html headers: $1 is the header level taken from
// <h1>..<h6>, $2 the header text. Returns the text wrapped in the run of "="
// that strcopy() builds for that level.
function pat4heads( $0, $1, $2 )
{
var bar = strcopy( '=', $1 );
var heading = bar + $2;

return heading + bar;
}

//***********************************************************
// Copy given str n times
// Returns ''str'' repeated ''n'' times. ''n'' may be a number or a numeric
// string; for n of 1 or less the string is returned once.
// The loop counter and the copy of the string are local variables now: they
// used to be undeclared, which created the globals ''i'' and ''pat'' and
// throws a ReferenceError in strict-mode code.
function strcopy( str, n )
{
var unit = str;
var result = str;

for (var k = 1; k < n; k++) result += unit;

return result;
}

//***********************************************************
// Corrects year and century ranges in text
// Normalises ranges of linked years ( [[1941]]-[[1945]] ) and of linked
// centuries in Roman numerals ( [[XIX]]-[[XX]] ): the separator becomes an
// em dash without surrounding spaces, and a following "г."/"гг." or
// "в."/"вв." abbreviation is attached with a no-break space.
// Returns the corrected text.
function CorrectRanges( t ) {
// [ pattern, replacement ] pairs, applied in this order
var rules = [
	// year ranges
	[ /(\(|\s)(\[\[[12]?\d{3}\]\])[\u00A0 ]?(-|--|–|—) ?(\[\[[12]?\d{3}\]\])(\W)/g, "$1$2—$4$5" ],
	[ /(\[\[[12]?\d{3}\]\]) ?(г\.|гг\.)/g, "$1\u00A0$2" ],
	// century ranges
	[ /(\(|\s)(\[\[[IVX]{1,5}\]\])[\u00A0 ]?(-|--|–|—) ?(\[\[[IVX]{1,5}\]\])(\W)/g, "$1$2—$4$5" ],
	[ /(\[\[[IVX]{1,5}\]\]) ?(в\.|вв\.)/g, "$1\u00A0$2" ]
];

for (var k = 0; k < rules.length; k++) {
	t = t.replace( rules[k][0], rules[k][1] );
}

return t;
}

//***********************************************************
// Prototype of ProcessTypography( t ) function

// Default typography step: hands the text back unchanged. Process() calls it
// on the text in which the protected fragments have been replaced by
// placeholders; local projects are expected to supply their own version.
function ProcessTypography( t )
{
var unchanged = t;
return unchanged;
}

//