Jump to content

User:Tbayer (WMF)/Converting wiki pages to plaintext emails

From Meta, a Wikimedia project coordination wiki

This is a Python script to convert a page on Meta-wiki into plaintext suitable for posting on mailing lists, e.g. [1] to [2]. I quickly hacked it together in order to save time when posting the monthly Wikimedia Foundation reports on WikimediaAnnounce-l, and it works fine for me (with only minimal manual cleanup remaining, most of which the script notifies one about). But the code isn't super elegant and I haven't tested it with anything else.

To use it, first open the wiki page in Firefox and save it in text file format. (Other browsers or HTML-to-plaintext converters might work too, but Chrome doesn't, because of the way it modifies internal links, and I seem to recall that with IE there were problems with line breaks.) Then convert it using the script (Python needs to be installed):

python wikihtmltxt2plaintext.py wikipage.txt wikipagefixed.txt

where wikihtmltxt2plaintext.py is:

# Quick-and-dirty script to convert a Meta-wiki page
# into plaintext for use in non-HTML emails
# Takes a wiki page saved from Firefox in text file format,
# and applies various fixes to it, mainly fixing links and
# restoring wikitext-style section headings.
# Media files included or linked on the page are assumed to be hosted on Commons.
# Unpolished alpha version ;)
# By T. Bayer ([[user:HaeB]])
import os
import sys
import re
import codecs
from collections import deque

# Verbosity toggle: when True, warnings/heads-ups are printed while converting.
debug = True # print every change made while converting
# debug = False # don't print changes

# Message shown when the script is invoked with the wrong number of arguments.
usageexplanation = 'usage: wikihtmltxt2plaintext.py inputfile outputfile , where inputfile is a Meta-wiki page saved under Firefox (not Chrome) as txt file, and outputfile should become a plaintext file suitable for posting on mailing lists'

class wikihtmltxt2plaintextError(Exception):
    """Script-specific error, raised e.g. for incorrect command-line usage."""

    def __init__(self, value):
        # Keep the offending value/message around for display.
        self.value = value

    def __str__(self):
        # repr() so that non-string values are rendered unambiguously.
        return repr(self.value)

# Require exactly two command-line arguments (input file, output file);
# otherwise abort with the usage explanation.
if len(sys.argv) != 3:
    raise wikihtmltxt2plaintextError(usageexplanation)

def convertline(m):
    # Apply the per-line link fixups and return the converted line.
    #
    # m:       one line of the Firefox text dump of the wiki page.
    # Returns: the converted line; the empty string if the line's only
    #          content was same-page anchor links.
    #
    # Bug fix: the original version fell off the end without a return
    # statement, so callers doing `m = convertline(m)` received None.

    # convert internal file links to full URLs pointing to Wikimedia Commons
    # (done before the generic /wiki/ rule below so that File: pages win):
    old = re.escape('</wiki/File:')
    new = '<Image: https://commons.wikimedia.org/wiki/File:'
    m = re.sub(old, new, m)
    # convert non-file internal links to external links:
    old = re.escape('</wiki/')
    new = '<https://meta.wikimedia.org/wiki/'
    m = re.sub(old, new, m)
    # remove anchor links (within the same page), e.g. " <#Section_name>":
    old = r'\ ?\<\#[^\>]*>'
    if re.search(old, m):
        m = re.sub(old, '', m)
        # remove the line if its only content was such anchor links:
        if not m.strip():
            m = ''
    # convert protocol-relative links ("<//host/...") to https:
    old = re.escape('<//')
    new = '<https://'
    m = re.sub(old, new, m)

    # Firefox's text rendering of embedded HTML5 video players leaves stray
    # player-control text behind; warn so it can be removed by hand.
    # (Parenthesized single-argument print works in both Python 2 and 3.)
    if re.search(' WebM 360P', m):
        print('Warning: There may be garbage text resulting from an embedded HTML5 video. Search for:')
        print('"'+m.rstrip()+'"\n')

    return m

# Command-line arguments: the Firefox text dump to read, and the
# plaintext file to write.
inputfilename = sys.argv[1]
outputfilename = sys.argv[2]

# codecs.open gives explicit UTF-8 decoding/encoding (Python 2 idiom).
# NOTE(review): the files are never explicitly closed; consider `with` blocks.
inputfile = codecs.open(inputfilename, mode='r', encoding='utf-8')
outputfile = codecs.open(outputfilename, mode='w', encoding='utf-8')

# Sliding window of recently read lines, so that constructs spanning
# several lines (section headings and their "[edit" links) can be
# stitched back together before output.
buf = deque()

line = inputfile.readline()   # read first line

# Main conversion loop: walk the dump line by line, repair section
# headings (Firefox renders them with an "[edit" link split across
# lines), then drain the buffer and convert each line for output.
#
# NOTE(review): this block appears truncated by the wiki-page extraction
# and is NOT runnable as-is:
#   - `line` is never appended to `buf`, so `buf[-1]` below raises
#     IndexError on the first iteration (an append was presumably lost);
#   - the two `if` statements near the end have no bodies (SyntaxError);
#   - nothing is ever written to `outputfile`.
# The code is preserved byte-for-byte; restore the lost lines before use.
while line:
    # look at the end of buffer,
    # remove section edit links, add '==...==' to mark section headings
    editlinkfirstline = '\ *' + re.escape('[edit')+'$'
    if re.match('.*'+editlinkfirstline, buf[-1]):
        # Looks like we found the beginning of a section edit link...
        # ... check whether the subsequent line contains the rest of the edit link:
        line = inputfile.readline()
        old = '[\ ]*'+re.escape('   </w/index.php?title=')+'[^\&]*'+re.escape('&action=edit&section=')+'[^\>]*\>\]$'
        if re.match(old, line):
            buf[-1] = re.sub(editlinkfirstline, '', buf[-1])  # discard first line part of edit link, too
            if re.match(r'\ *$', buf[-1]): # line consisted only of blanks + first line part of edit link
                buf.pop()   # discard empty line
            buf[-1] = buf[-1].rstrip()  # remove line break temporarily
            # read back to get entire section title (long ones might be broken over several lines),
            leadingblanks = re.match(r'\ *', buf[-1]).group()
            while leadingblanks == re.match(r'\ *', buf[-2]).group():
                # if subsequent lines have the same number of leading blanks,
                # assume they are still part of the section heading:
                buf[-1] = buf[-2].rstrip() + re.sub(r'\ *', ' ', buf.pop(), 1)
            buf[-1] = ' ' + buf[-1] + ' '  # add single spaces for legibility (e.g. "== Notes ==" instead of "==Notes=="
            while re.match('[\=]*  ', buf[-1]):
                # each pair of leading blanks becomes one '=' on each side:
                buf[-1] = re.sub(r'  ', '=', buf[-1],1)+'='  # hack, depends on #(leading blanks) = 2*sectionlevel
            buf[-1] += '\n'   #readd line break
    line = inputfile.readline()   # read next line - if we are the EOF, flush buffer:

    # Keep at most ~4 lines buffered; at EOF (not line) drain everything.
    while (len(buf)>4 or not line) and len(buf)>0:
        m = buf.popleft()
        imagelink = re.escape('</wiki/File:')        
        if re.match(imagelink, m):
            print 'm = ' + m
            print 'buf = '
            print list(buf)
            # Hack to remove double links to images and videos
            # (both the thumbnail and the magnifying glass icon link to file description page):
            # NOTE(review): the bodies of these two `if` statements were lost
            # in the wiki extraction — presumably they dropped the duplicate
            # buffered link lines. Restore before running.
            if m == buf[1] and buf[0].rstrip() == '':
            if m == buf[0]:

            m = convertline(m)
            # print heads-up about file link, because some of them are actually to e.g. videos;
            # in that case "<Image:.. " will need to be changed by hand to "<Video:.. ":
            print 'Check if image or other file type, and adjust "<Image: ..." accordingly: ', m
            m = convertline(m)

See also[edit]