Jump to content

Uploadmultiple.py/Uploadmultiple.py

From Meta, a Wikimedia project coordination wiki
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot uploads multiple files as specified in a list of files in
a separate text document ("uploadlist.txt" by default). Each file
name should be in quotation marks and comma separated with no line
breaks. For example:

--------------------------------------------------------------------
  "file1.jpg","file2.jpg","c:\docs\wiki\something.jpg"
--------------------------------------------------------------------

Available arguments:
-debug     Switch to debug mode. Files are not actually uploaded.
-file:xxx  Specify the file which contains the list of filenames to
           be uploaded.
-nodesc    Do not ask for a description for each file.
-rename    Prompt for a new filename for each file in the filelist.

It is recommended that you rename your files to the desired
destination names BEFORE running this script, for two reasons.
First, this script won't ask you for the new filename unless you
specify the -rename argument, and second, since you can't see the
image you're about to upload, it makes it tough to know which file
is which if they're all called IMG_0023, etc.

Once you have all your files renamed, you can export a list of the
files in a directory using the DOS "dir" command, like this:

--------------------------------------------------------------------
  dir /w > uploadlist.txt
--------------------------------------------------------------------

This will drop all of the filenames into a text file named
uploadlist.txt. Then, open that file with a text editor and do some
quick find-and-replaces to trim it down to just a list of filenames,
each in quotation marks and comma separated, as shown above. If
there are extra files in the directory which you don't want uploaded
then be sure to edit them out of the file at this time.

Then execute this script.

This script is a combination of pagefromfile.py and upload.py by
Andre Engels (2004). This is my first attempt at doing anything in
Python so I know my code sucks donkey balls. But it does the
job. YMMV.

TODO:
  - check fileexists before trying to upload
  - import directory recursively instead of from uploadlist.txt
  - clean up crappy code

"""
#
# (C) Jonathan Kovaciny, 2007
#
# Distributed under the terms of the MIT license.
#

__version__='$Id: Exp $'

import wikipedia, config, mediawiki_messages
import os, re, sys, codecs
import urllib, httplib


# Edit-summary text keyed by language code. The module-level
# wikipedia.translate(mysite, msg) call further down looks an entry up from
# this table for the current site.
# Note: these are not the real translations
msg={
    'de': u'Automatischer upload',
    'en': u'Automated upload of files',
    'fr': u'Upload automatique',
    'ia': u'Upload automatic',
    'id': u'Upload automatis',
    'it': u'Upload automatico',
    'ksh': u'Automatesch upload',
    'nl': u'Automatis upload',
    'pl': u'Automatyczny upload',
    'pt': u'Upload ção automática'
    }

# Per-language text for the "existing content was overwritten" case, keyed by
# language code.
# NOTE(review): nothing in the visible part of this file reads msg_force;
# presumably carried over from the scripts this one was derived from - confirm
# before removing.
msg_force={
    'en': u'existing image overwritten',
    'he': u'הטקסט הקיים נדרס',
    'fr': u'texte existant écrasé',
    'id': u'menimpa teks yang ada',
    'it': u'sovrascritto il testo esistente',
    'ksh': u'Automatesch ußjetuusch',
    'nl': u'bestaande tekst overschreven',
    'pl': u'aktualny tekst nadpisany',
    'pt': u'sobrescrever texto'
    }

def post_multipart(site, address, fields, files, cookies):
    """
    Send form fields and file payloads to a wiki as multipart/form-data.

    site    - object whose postData() method performs the HTTP request
    address - address on that site to post to
    fields  - sequence of (name, value) pairs for ordinary form fields
    files   - sequence of (name, filename, value) triples for file uploads
    cookies - accepted for interface compatibility; not used in this function

    Returns whatever site.postData() returns.
    """
    # Build the request body and its matching Content-Type header value.
    mime_header, payload = encode_multipart_formdata(fields, files)
    reply = site.postData(address, payload, contentType = mime_header)
    return reply

def encode_multipart_formdata(fields, files):
    """
    Serialize form fields and file payloads as a multipart/form-data body.

    fields - sequence of (name, value) pairs for ordinary form fields
    files  - sequence of (name, filename, value) triples for file uploads

    Returns a (content_type, body) tuple: the Content-Type header value
    naming the boundary, and the CRLF-joined request body.
    """
    boundary = '----------ThIs_Is_tHe_bouNdaRY_$'
    delimiter = '--' + boundary
    parts = []
    # Ordinary fields: delimiter, disposition header, blank line, value.
    for name, value in fields:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"' % name,
            '',
            value,
        ])
    # File fields additionally carry the file name and a guessed MIME type.
    for name, fname, payload in files:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"; filename="%s"' % (name, fname),
            'Content-Type: %s' % get_content_type(fname),
            '',
            payload,
        ])
    # Closing delimiter, followed by an empty part so the body ends in CRLF.
    parts.extend([delimiter + '--', ''])
    body = '\r\n'.join(parts)
    return 'multipart/form-data; boundary=%s' % boundary, body

def get_content_type(filename):
    """
    Guess the MIME type for filename from its extension.

    Falls back to 'application/octet-stream' when the mimetypes module
    cannot determine a type.
    """
    from mimetypes import guess_type
    guessed, _encoding = guess_type(filename)
    if guessed:
        return guessed
    return 'application/octet-stream'


class UploadRobot:
    """
    Interactive helper that uploads a single local file or URL to a wiki.

    The robot reads the file, optionally asks the user for a new file name
    and a description, and posts the result to the target site's upload
    address via post_multipart().
    """

    def __init__(self, url, description = u'', keepFilename = False, verifyDescription = True, ignoreWarning = False, targetSite = None, urlEncoding = None):
        """
        url               - Local path, or URL (anything containing '://'),
                            of the file to upload.
        description       - Initial description text sent with the upload.
        keepFilename      - If False, the user is prompted for a new file
                            name before uploading.
        verifyDescription - If True, the user is prompted for a description
                            before uploading.
        ignoreWarning     - Set this to True if you want to upload even if another
                            file would be overwritten or another mistake would be
                            risked.
                            Attention: This parameter doesn't work yet for unknown reason.
        targetSite        - Site to upload to. When omitted, Commons is used
                            if config.upload_to_commons is set, otherwise
                            the default site.
        urlEncoding       - If set, the file name taken from url is
                            URL-unquoted and decoded with this encoding.

        Forces a login on the target site as its last step.
        """
        self.url = url
        self.urlEncoding = urlEncoding
        self.description = description
        self.keepFilename = keepFilename
        self.verifyDescription = verifyDescription
        self.ignoreWarning = ignoreWarning
        if config.upload_to_commons:
            self.targetSite = targetSite or wikipedia.getSite('commons', 'commons')
        else:
            self.targetSite = targetSite or wikipedia.getSite()
        self.targetSite.forceLogin()

    def urlOK(self):
        '''
        Returns true iff the URL references an online site or an
        existing local file.
        '''
        return self.url != '' and ('://' in self.url or os.path.exists(self.url))

    def upload_image(self, debug=False):
        """Gets the image at URL self.url, and uploads it to the target wiki.

        debug - if True, everything except the actual HTTP upload is done.

        Returns the filename which was used to upload the image (also
        returned when the file was skipped or debug is set).
        If the upload fails, the user is asked whether to try again or not.
        If the user chooses not to retry, or the download failed, returns
        None. Typing "quit" at either prompt exits via sys.exit().
        """
        skipthisfile = False
        # Get file contents
        if '://' in self.url:
            uo = wikipedia.MyURLopener()
            # NOTE(review): if MyURLopener.open follows the signature of
            # urllib.URLopener.open(url, data), "rb" is sent as POST data
            # rather than acting as a file mode - confirm.
            source = uo.open(self.url,"rb")
        else:
            # Opening local files with MyURLopener would be possible, but we
            # don't do it because it only accepts ASCII characters in the
            # filename.
            source = open(self.url,"rb")
        try:
            wikipedia.output(u'\n\n--------------------------------------------------------------------')
            wikipedia.output(u'\n      %s\n' % self.url)
            contents = source.read()
        finally:
            # Always release the handle, including on the early return below.
            source.close()
        if "The requested URL was not found on this server." in contents:
            wikipedia.output(u"Couldn't download the image.")
            return None
        # Isolate the pure name
        filename = self.url
        if '/' in filename:
            filename = filename.split('/')[-1]
        if '\\' in filename:
            filename = filename.split('\\')[-1]
        if self.urlEncoding:
            filename = urllib.unquote(filename)
            filename = filename.decode(self.urlEncoding)
        if not self.keepFilename:
            # ask newfn until it's valid
            ok = False
            # FIXME: these 2 belong somewhere else, presumably in family
            forbidden = '/' # to be extended
            allowed_formats = (u'gif', u'jpg', u'jpeg', u'mid', u'midi', u'ogg', u'png', u'svg', u'xcf')
            while not ok:
                ok = True
                newfn = wikipedia.input(u'Type the new filename for this file and press Enter. Leave blank and\npress Enter to keep the current filename. To skip uploading this\nfile, type \"skip\" and press Enter. To abort all uploads, type \"quit\"\nand press Enter.\n\nFilename:')
                if newfn == "":
                    newfn = filename
                elif newfn == "skip":
                    wikipedia.output(u'\nFILE SKIPPED')
                    skipthisfile = True
                elif newfn == "quit":
                    wikipedia.output(u'\nUPLOAD CANCELED')
                    sys.exit()
                if newfn != "skip":
                    ext = os.path.splitext(newfn)[1].lower().strip('.')
                    for c in forbidden:
                        if c in newfn:
                            wikipedia.output(u"Invalid character: %s. Please try again" % c)
                            ok = False
                    # Only ask about the extension once the name itself is valid.
                    if ext not in allowed_formats and ok:
                        choice = wikipedia.inputChoice(u"File format is not one of [%s], but %s. Continue?" % (u' '.join(allowed_formats), ext), ['yes', 'no'], ['y', 'N'], 'N')
                        if choice == 'n':
                            ok = False
            if newfn != '' and newfn != "skip":
                filename = newfn
                wikipedia.output(u'\nThis file will be named \" %s \" when uploaded.\n\n' % newfn)
        # MediaWiki doesn't allow spaces in the file name.
        # Replace them here to avoid an extra confirmation form
        filename = filename.replace(' ', '_')
        # Convert the filename (currently Unicode) to the encoding used on the
        # target wiki
        encodedFilename = filename.encode(self.targetSite.encoding())
        # A proper description for the submission.
        if self.verifyDescription and not skipthisfile:
            newDescription = wikipedia.input(u'Enter a brief description for this file and press Enter. Wikitext is\nacceptable. Leave blank if no description is desired. To use the    \nexternal editor, type \"edit\" and press Enter. To skip uploading this\nfile, type \"skip\" and press Enter. To abort all uploads, type \"quit\"\nand press Enter.\n\nDescription:')
            if newDescription == "edit":
                import editarticle
                editor = editarticle.TextEditor()
                newDescription = editor.edit(self.description)
            elif newDescription == "skip":
                wikipedia.output(u'\nFILE SKIPPED')
                skipthisfile = True
            elif newDescription == "quit":
                wikipedia.output(u'\nUPLOAD CANCELED')
                sys.exit()
            if newDescription:
                self.description = newDescription

        formdata = {}
        formdata["wpUploadDescription"] = self.description
    #     if self.targetSite.version() >= '1.5':
    #         formdata["wpUploadCopyStatus"] = wikipedia.input(u"Copyright status: ")
    #         formdata["wpUploadSource"] = wikipedia.input(u"Source of image: ")
        formdata["wpUploadAffirm"] = "1"
        formdata["wpUpload"] = "upload bestand"
        # This somehow doesn't work.
        if self.ignoreWarning:
            formdata["wpIgnoreWarning"] = "1"
        else:
            formdata["wpIgnoreWarning"] = "0"

        # try to encode the strings to the encoding used by the target site.
        # if that's not possible (e.g. because there are non-Latin-1 characters and
        # the home Wikipedia uses Latin-1), convert all non-ASCII characters to
        # HTML entities.
        for key in formdata:
            try:
                formdata[key] = formdata[key].encode(self.targetSite.encoding())
            except (UnicodeEncodeError, UnicodeDecodeError):
                formdata[key] = wikipedia.UnicodeToAsciiHtml(formdata[key]).encode(self.targetSite.encoding())

        # don't upload if we're in debug mode
        if not debug and not skipthisfile:
            wikipedia.output(u'\nUploading file to %s' % self.targetSite)
            response, returned_html = post_multipart(self.targetSite,
                                  self.targetSite.upload_address(),
                                  formdata.items(),
                                  (('wpUploadFile', encodedFilename, contents),),
                                  cookies = self.targetSite.cookies()
                                  )
            # There are 2 ways MediaWiki can react on success: either it gives
            # a 200 with a success message, or it gives a 302 (redirection).
            # Do we know how the "success!" HTML page should look like?
            # ATTENTION: if you changed your Wikimedia Commons account not to show
            # an English interface, this detection will fail!
            success_msg = mediawiki_messages.get('successfulupload', site = self.targetSite)
            # Checking only response.status in [200, 302] is not a good idea,
            # because the server also gives a 200 when something went wrong.
            if success_msg in returned_html or response.status == 302:
                wikipedia.output(u"Upload successful.")
            else:
                try:
                    # Try to find the error message within the HTML page.
                    # If we can't find it, we just dump the entire HTML page.
                    start = returned_html.index('<!-- start content -->') + 22
                    end = returned_html.index('<!-- end content -->')
                    returned_html = returned_html[start:end]
                except ValueError:
                    # Content markers are missing: keep the whole page.
                    pass
                wikipedia.output(u'%s\n\n' % returned_html)
                wikipedia.output(u'%i %s' % (response.status, response.reason))
                answer = wikipedia.inputChoice(u'Upload of %s probably failed. Above you see the HTML page which was returned by MediaWiki. Try again?' % filename, ['Yes', 'No'], ['y', 'N'], 'N')
                if answer in ["y", "Y"]:
                    # Retry from the start; this re-reads the file and
                    # re-prompts the user.
                    return self.upload_image(debug)
                return None
        return filename

    def run(self):
        """
        Prompt until self.url names a URL or an existing local file, then
        upload it. Returns the result of upload_image().
        """
        while not self.urlOK():
            if not self.url:
                wikipedia.output(u'No input filename given')
            else:
                wikipedia.output(u'Invalid input filename given. Try again.')
            self.url = wikipedia.input(u'File or URL where image is now:')
        return self.upload_image()

# TODO: make object-oriented

# Delimiters findfilename() uses to locate each quoted entry in the list file:
# an entry opens with starttext and closes with endtext, or with a bare
# starttext when there is no trailing comma (the last entry).
starttext = "\""
endtext = "\","
# Default list file read by main(); replaced by the -file:xxx argument.
filename = "uploadlist.txt"
# Set to True by -debug; findfilename() then only prints each file name.
debug = False
# Passed to UploadRobot; -rename sets it to False so a new name is asked for.
keepFilename = True
# Passed to UploadRobot; -nodesc sets it to False so no description is asked for.
verifyDescription = True
mysite = wikipedia.getSite()
# Edit summary looked up from msg for the current site.
# NOTE(review): commenttext is not read anywhere else in the visible file.
commenttext = wikipedia.translate(mysite,msg)
# NOTE(review): appears unused at module level; findfilename() assigns its own
# local variable of the same name.
contents = u''
# Initial description handed to every UploadRobot created by findfilename().
description = u''

def findfilename(t):
    # this code sucks, but i'm too lazy to figure out how to do it right
    try:
        location = re.search(starttext+"([^\Z]*?)"+endtext,t)
        contents = location.group(1)
    except AttributeError:
        try:
            location = re.search(starttext+"([^\Z]*?)"+starttext,t)
            contents = location.group(1)
        except AttributeError:
            print 'DONE.'
            return 0
        else:
            if debug:
                print 'Debug mode: Not uploading file \" ' + contents + " \" "
            else:
                bot = UploadRobot(contents, description, keepFilename, verifyDescription)
                bot.run()
    else:
        if debug:
            print 'Debug mode: Not uploading file \" ' + contents + " \" "
        else:
            bot = UploadRobot(contents, description, keepFilename, verifyDescription)
            bot.run()
    return location.end()

def main():
    """
    Print the usage summary, read the list file and upload every entry.

    The list file named by the module-level filename is read in full using
    config.textfile_encoding, then findfilename() is called repeatedly on
    the not-yet-processed remainder until it reports that no entry is left.
    """
    wikipedia.output(u'\nAvailable arguments:\n-debug     Switch to debug mode. Files are not actually uploaded.\n-file:xxx  Specify the file which contains the list of filenames to\n           be uploaded. \"uploadlist.txt\" is default.\n-nodesc    Do not ask for a description for each file.\n-rename    Prompt for a new filename for each file in the filelist.\n')
    f = codecs.open(filename,'r', encoding = config.textfile_encoding)
    try:
        text = f.read()
    finally:
        # Release the list file even if reading or decoding fails.
        f.close()
    position = 0
    while True:
        # findfilename() returns how far it consumed, or 0 when finished.
        consumed = findfilename(text[position:])
        if consumed <= 0:
            break
        position += consumed

# Apply the command-line switches to the module-level settings. Arguments
# returned by wikipedia.handleArgs() that match none of the switches below are
# ignored.
args = wikipedia.handleArgs()
for arg in args:
    if not arg:
        continue
    if arg == "-debug":
        wikipedia.output(u"Debug mode enabled.")
        debug = True
    elif arg[:7] == '-rename':
        # Ask for a new file name for every file.
        keepFilename = False
    elif arg[:7] == '-nodesc':
        # Do not ask for a description.
        verifyDescription = False
    elif arg[:6] == "-file:":
        # Everything after the "-file:" prefix is the list file to read.
        filename = arg[len("-file:"):]
 
# Script entry point: run the uploads, and call wikipedia.stopme() afterwards
# whether main() returned normally or raised (including sys.exit()).
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()