Jump to content

Module:Diff

From Meta, a Wikimedia project coordination wiki
Module documentation

Provides functions for diffing text.

Usage

[edit]

Ciaran Hope (born 4 August 1974) is an Irish composer of orchestral, choral, and film music.

Difference in words

[edit]
{{TextDiff
| 1 = Ciaran Hope (born 4 August 1974) is an Irish composer of orchestral, choral, and film music.
| 2 = Ciaran Hope (was given birth to on 20 August last year) is an Irish composter of orchestral, oral, and film music dance moves.
}}

Result:

[[w:Ciaran Hope|Ciaran Hope]] (born 4 August 1974) is an Irish composer of orchestral, choral, and film music.
+
[[w:Ciaran Hope|Ciaran Hope]] (was given birth to on 20 August last year) is an Irish composter of orchestral, oral, and film music dance moves.

Difference in characters

[edit]
{{StringDiff
| 1 = Ciaran Hope (born 4 August 1974) is an Irish composer of orchestral, choral, and film music.
| 2 = Ciaran Hope (was given birth to on 20 August last year) is an Irish composter of orchestral, oral, and film music dance moves.
}}

Result:

[[w:Ciaran Hope|Ciaran Hope]] (born 4 August 1974) is an Irish composer of orchestral, choral, and film music.
+
[[w:Ciaran Hope|Ciaran Hope]] (was given birth to on 20 August last year) is an Irish composter of orchestral, oral, and film music dance moves.

Example with #invoke

[edit]
{{#invoke:Diff|main
| 1 = Ciaran Hope (born 4 August 1974) is an Irish composer of orchestral, choral, and film music.
| 2 = Ciaran Hope (was given birth to on 20 August last year) is an Irish composter of orchestral, oral, and film music dance moves.
}}

Result:

[[w:Ciaran Hope|Ciaran Hope]] (born 4 August 1974) is an Irish composer of orchestral, choral, and film music.
+
[[w:Ciaran Hope|Ciaran Hope]] (was given birth to on 20 August last year) is an Irish composter of orchestral, oral, and film music dance moves.

-----------------------------------------------------------------------------
-- Provides functions for diffing text.
--
-- (c) 2007, 2008  Yuri Takhteyev (yuri@freewisdom.org)
-- (c) 2007 Hisham Muhammad
-- Adapted to MediaWiki LUA: [[User:Ebraminio]] <ebrahim -at- gnu.org>
--
-- License: MIT/X, see http://sputnik.freewisdom.org/en/License
-----------------------------------------------------------------------------

SKIP_SEPARATOR = true  -- a constant

IN   = "in"; OUT  = "out"; SAME = "same"  -- token statuses

-----------------------------------------------------------------------------
-- Split a string into tokens.  (Adapted from Gavin Kistner's split on
-- http://lua-users.org/wiki/SplitJoin.
--
-- @param text           A string to be split.
-- @param separator      [optional] the separator pattern (defaults to any
--                       white space - %s+).
-- @param skip_separator [optional] don't include the sepator in the results.     
-- @return               A list of tokens.
-----------------------------------------------------------------------------
function split(text, separator, skip_separator)
   separator = separator or "%s+"
   local parts = {}  
   local start = 1
   local split_start, split_end = mw.ustring.find(text, separator, start)
   while split_start do
      table.insert(parts, mw.ustring.sub(text, start, split_start-1))
      if not skip_separator then
         table.insert(parts, mw.ustring.sub(text, split_start, split_end))
      end
      start = split_end + 1
      split_start, split_end = mw.ustring.find(text, separator, start)
   end
   if mw.ustring.sub(text, start) ~= "" then
      table.insert(parts, mw.ustring.sub(text, start))
   end
   return parts
end


-----------------------------------------------------------------------------
-- Derives the longest common subsequence of two strings.  This is a faster
-- implementation than one provided by stdlib.  Submitted by Hisham Muhammad. 
-- The algorithm was taken from:
-- http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_subsequence
--
-- @param t1             the first string.
-- @param t2             the second string.
-- @return               the least common subsequence as a matrix.
-----------------------------------------------------------------------------
function quick_LCS(t1, t2)
   local m = #t1
   local n = #t2

   -- Build matrix on demand
   local C = {}
   local setmetatable = setmetatable
   local mt_tbl = {
      __index = function(t, k)
         t[k] = 0
         return 0
      end
   }
   local mt_C = {
      __index = function(t, k)
         local tbl = {}
         setmetatable(tbl, mt_tbl)
         t[k] = tbl
         return tbl
      end
   }
   setmetatable(C, mt_C)
   local max = math.max
   for i = 1, m+1 do
      local ci1 = C[i+1]
      local ci = C[i]
      for j = 1, n+1 do
         if t1[i-1] == t2[j-1] then
            ci1[j+1] = ci[j] + 1
         else
            ci1[j+1] = max(ci1[j], ci[j+1])
         end
      end
   end
   return C
end



-----------------------------------------------------------------------------
-- Formats an inline diff as HTML, with <ins> and <del> tags.
-- 
-- @param tokens         a table of {token, status} pairs.
-- @return               an HTML string.
-----------------------------------------------------------------------------
function format_as_html(tokens)
   local diff_buffer = ""
   local token, status
   for i, token_record in ipairs(tokens) do
      token = mw.text.nowiki(token_record[1])
      status = token_record[2]
      if status == "in" then
         diff_buffer = diff_buffer..'<ins>'..token..'</ins>'
      elseif status == "out" then
         diff_buffer = diff_buffer..'<del>'..token..'</del>'
      else 
         diff_buffer = diff_buffer..token
      end
   end
   return diff_buffer
end

-----------------------------------------------------------------------------
-- Returns a diff of two strings as a list of pairs, where the first value
-- represents a token and the second the token's status ("same", "in", "out").
--
-- @param old             The "old" text string
-- @param new             The "new" text string
-- @param separator      [optional] the separator pattern (defaults ot any
--                       white space).
-- @return               A list of annotated tokens.
-----------------------------------------------------------------------------
function diff(old, new, separator)
   assert(old); assert(new)
   new = split(new, separator); old = split(old, separator)

   -- First, compare the beginnings and ends of strings to remove the common
   -- prefix and suffix.  Chances are, there is only a small number of tokens
   -- in the middle that differ, in which case  we can save ourselves a lot
   -- in terms of LCS computation.
   local prefix = "" -- common text in the beginning
   local suffix = "" -- common text in the end
   while old[1] and old[1] == new[1] do
      local token = table.remove(old, 1)
      table.remove(new, 1)
      prefix = prefix..token
   end
   while old[#old] and old[#old] == new[#new] do
      local token = table.remove(old)
      table.remove(new)
      suffix = token..suffix
   end

   -- Setup a table that will store the diff (an upvalue for get_diff). We'll
   -- store it in the reverse order to allow for tail calls.  We'll also keep
   -- in this table functions to handle different events.
   local rev_diff = {
      put  = function(self, token, type) table.insert(self, {token,type}) end,
      ins  = function(self, token) self:put(token, IN) end,
      del  = function(self, token) self:put(token, OUT) end,
      same = function(self, token) if token then self:put(token, SAME) end end,
   }

   -- Put the suffix as the first token (we are storing the diff in the
   -- reverse order)

   rev_diff:same(suffix)

   -- Define a function that will scan the LCS matrix backwards and build the
   -- diff output recursively.
   local function get_diff(C, old, new, i, j)
      local old_i = old[i]
      local new_j = new[j]
      if i >= 1 and j >= 1 and old_i == new_j then
         rev_diff:same(old_i)
         return get_diff(C, old, new, i-1, j-1)
      else
         local Cij1 = C[i][j-1]
         local Ci1j = C[i-1][j]
         if j >= 1 and (i == 0 or Cij1 >= Ci1j) then
            rev_diff:ins(new_j)
            return get_diff(C, old, new, i, j-1)
         elseif i >= 1 and (j == 0 or Cij1 < Ci1j) then
            rev_diff:del(old_i)
            return get_diff(C, old, new, i-1, j)
         end
      end
   end
   -- Then call it.
   get_diff(quick_LCS(old, new), old, new, #old + 1, #new + 1)

   -- Put the prefix in at the end
   rev_diff:same(prefix)

   -- Reverse the diff.
   local diff = {}

   for i = #rev_diff, 1, -1 do
      table.insert(diff, rev_diff[i])
   end
   diff.to_html = format_as_html
   return diff
end

-----------------------------------------------------------------------------
-- Wiki diff style, currently just for a line
-----------------------------------------------------------------------------
function wikiDiff(old, new, separator)
  local tokens = diff(old, new, separator)
  local root = mw.html.create('')

  local token, status

  local plusMinusStyle = 'width: 2%; padding: 0.25em; font-weight: bold;' ..
    'font-size: 1.25em; text-align: end;'
  local tdDivStyle = 'word-wrap: break-word; direction: ltr;'

  local tdSharedStyle = 'width: 48%; border-style: solid; border-radius: 0.33em; ' ..
    'padding: 0.33em 0.5em; font-size: 88%; border-width: 1px 1px 1px 4px; ' ..
    '-webkit-border-end-width: 1px; -webkit-border-start-width: 4px; ' ..
    '-moz-border-end-width: 1px; -moz-border-start-width: 4px;' -- these overrides default border-width for browsers supports them, needed for RTL UI on commons
  local insDelSharedStyle = 'padding: 0.25em 0; font-weight: bold; text-decoration: initial;'


  local tr = root:tag('table'):addClass('diff'):css('width', '100%'):tag('tr')

  tr:tag('td')
    :addClass('diff-marker')
    :cssText(plusMinusStyle)
    :wikitext('−')

  local deleted = tr
    :tag('td')
      :cssText('border-color: #ffe49c; ' .. tdSharedStyle)
      :addClass('diff-deletedline')
      :tag('div')
        :cssText(tdDivStyle)

  for i, token_record in ipairs(tokens) do
    token = mw.text.nowiki(token_record[1])
    status = token_record[2]
    if status == OUT then
      deleted
        :tag('del')
          :cssText('background: #feeec8; ' .. insDelSharedStyle)
          :addClass('diffchange')
          :addClass('diffchange-inline')
          :wikitext(token)
    elseif status == SAME then
      deleted:wikitext(token)
    end
  end

  tr:tag('td')
    :cssText(plusMinusStyle)
    :wikitext('+')

  local inserted = tr
    :tag('td')
      :cssText('border-color: #a3d3ff; ' .. tdSharedStyle)
      :addClass('diff-addedline')
      :tag('div')
        :cssText(tdDivStyle)

  for i, token_record in ipairs(tokens) do
    token = mw.text.nowiki(token_record[1])
    status = token_record[2]
    if status == IN then
      inserted
        :tag('ins')
          :cssText('background: #d8ecff; ' .. insDelSharedStyle)
          :addClass('diffchange')
          :addClass('diffchange-inline')
          :wikitext(token)
    elseif status == SAME then
      inserted:wikitext(token)
    end
  end

  return tostring(root)
end

function main(frame)
  return wikiDiff(mw.text.unstrip(mw.text.decode(frame.args[1])), mw.text.decode(mw.text.unstrip(frame.args[2])), frame.args[3] or '[%s%.:-]+')
end

return {
  diff = diff,
  wikiDiff = wikiDiff,
  main = main
}