This revision (edit summary: "pre tags - new version of replace.py") replaced the previous content of this page, "My version of replace.py, I think modified sometime in 2009", with the standard replace.py from June 2010. The main differences between the two versions:

* The import changed from "import wikipedia, pagegenerators" to "import wikipedia as pywikibot" plus "import pagegenerators", and every wikipedia.* call was renamed to pywikibot.* accordingly.
* A new -query: option (backed by a maxquerysize variable, default 60) controls how many pages the bot preloads per batch; it is ignored when reading an XML dump.
* The documentation gained a note and an example showing how to pass more than one replacement pair on the command line.
* A 'uk' (Ukrainian) edit summary message was added.
* ReplaceRobot now builds the category to add with pywikibot.Page(site, addedCat, defaultNamespace=14) instead of prefixing the category namespace by hand, and page.categories() is called without nofollow_redirects=True.
* The -addcat: argument is now sliced as arg[8:], fixing the old arg[len('addcat:'):], which left a leading colon on the category name.
* The copyright line was updated to "Pywikipedia team, 2004-2009", and __version__ moved into the header block, now reading r7695 (2009-11-26, alexsh) instead of r6844 (2009-05-07, siebrand).
* Assorted cosmetic cleanups: double-quoted dictionary-key tests, "if not summary_commandline:" instead of comparisons against True/False, and "from __future__ import generators" moved up next to the module docstring.

Revision as of 08:40, 8 June 2010

Standard replace.py, June 2010:

# -*- coding: utf-8 -*-
"""
This bot will make direct text replacements. It will retrieve information on
which pages might need changes either from an XML dump or a text file, or only
change a single page.

These command line parameters can be used to specify which pages to work on:

&params;

-xml              Retrieve information from a local XML dump (pages-articles
                  or pages-meta-current, see http://download.wikimedia.org).
                  Argument can also be given as "-xml:filename".

-page             Only edit a specific page.
                  Argument can also be given as "-page:pagetitle". You can
                  give this parameter multiple times to edit multiple pages.

Furthermore, the following command line parameters are supported:

-regex            Make replacements using regular expressions. If this argument
                  isn't given, the bot will make simple text replacements.

-nocase           Use case insensitive regular expressions.

-dotall           Make the dot match any character at all, including a newline.
                  Without this flag, '.' will match anything except a newline.

-multiline        '^' and '$' will now match begin and end of each line.

-xmlstart         (Only works with -xml) Skip all articles in the XML dump
                  before the one specified (may also be given as
                  -xmlstart:Article).

-addcat:cat_name  Adds "cat_name" category to every altered page.

-excepttitle:XYZ  Skip pages with titles that contain XYZ. If the -regex
                  argument is given, XYZ will be regarded as a regular
                  expression.

-requiretitle:XYZ Only do pages with titles that contain XYZ. If the -regex
                  argument is given, XYZ will be regarded as a regular
                  expression.

-excepttext:XYZ   Skip pages which contain the text XYZ. If the -regex
                  argument is given, XYZ will be regarded as a regular
                  expression.

-exceptinside:XYZ Skip occurrences of the to-be-replaced text which lie
                  within XYZ. If the -regex argument is given, XYZ will be
                  regarded as a regular expression.

-exceptinsidetag:XYZ Skip occurrences of the to-be-replaced text which lie
                  within an XYZ tag.

-summary:XYZ      Set the summary message text for the edit to XYZ, bypassing
                  the predefined message texts with original and replacements
                  inserted.

-sleep:123        If you use -fix, the bot may check several regexes on each
                  page, which can waste a lot of CPU because every regex is
                  applied without pausing. This option makes the bot sleep for
                  the given number of seconds between one regex and the next,
                  so it does not hog the CPU.
 
-query:           The maximum number of pages that the bot will load at once.
                  Default value is 60. Ignored when reading an XML file.

-fix:XYZ          Perform one of the predefined replacements tasks, which are
                  given in the dictionary 'fixes' defined inside the file
                  fixes.py.
                  The -regex and -nocase arguments and any replacements given
                  on the command line will be ignored if you use -fix.
                  Currently available predefined fixes are:
&fixes-help;

-always           Don't prompt you for each replacement

-recursive        Recurse replacement as long as possible. Be careful, this
                  might lead to an infinite loop.

-allowoverlap     When occurrences of the pattern overlap, replace all of them.
                  Be careful, this might lead to an infinite loop.

other:            First argument is the old text, second argument is the new
                  text. If the -regex argument is given, the first argument
                  will be regarded as a regular expression, and the second
                  argument might contain expressions like \\1 or \g<name>.
                  It is possible to introduce more than one pair of old text
                  and replacement.

Examples:

If you want to change templates from the old syntax, e.g. {{msg:Stub}}, to the
new syntax, e.g. {{Stub}}, download an XML dump file (pages-articles) from
http://download.wikimedia.org, then use this command:

    python replace.py -xml -regex "{{msg:(.*?)}}" "{{\\1}}"

If you have a dump called foobar.xml and want to fix typos in articles, e.g.
Errror -> Error, use this:

    python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0

If you want to do more than one replacement at a time, use this:

    python replace.py -xml:foobar.xml "Errror" "Error" "Faail" "Fail" -namespace:0

If you have a page called 'John Doe' and want to fix the format of ISBNs, use:

    python replace.py -page:John_Doe -fix:isbn

This command will change 'referer' to 'referrer', but not in pages which
talk about HTTP, where the typo has become part of the standard:

    python replace.py referer referrer -file:typos.txt -excepttext:HTTP
"""
from __future__ import generators
#
# (C) Daniel Herding & the Pywikipedia team, 2004-2009
#
__version__='$Id: replace.py 7695 2009-11-26 09:28:38Z alexsh $'
#
# Distributed under the terms of the MIT license.
#

import sys, re, time
import wikipedia as pywikibot
import pagegenerators
import editarticle
import webbrowser

# Imports predefined replacements tasks from fixes.py
import fixes

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;':     pagegenerators.parameterHelp,
    '&fixes-help;': fixes.help,
}


# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# below.
msg = {
    'ar': u'%s روبوت : استبدال تلقائي للنص',
    'ca': u'Robot: Reemplaçament automàtic de text %s',
    'cs': u'Robot automaticky nahradil text: %s',
    'de': u'Bot: Automatisierte Textersetzung %s',
    'el': u'Ρομπότ: Αυτόματη αντικατάσταση κειμένου %s',
    'en': u'Robot: Automated text replacement %s',
    'es': u'Robot: Reemplazo automático de texto %s',
    'fa': u'ربات: تغییر خودکار متن %s',
    'fi': u'Botti korvasi automaattisesti tekstin %s',
    'fr': u'Robot : Remplacement de texte automatisé %s',
    'he': u'בוט: החלפת טקסט אוטומטית %s',
    'hu': u'Robot: Automatikus szövegcsere %s',
    'ia': u'Robot: Reimplaciamento automatic de texto %s',
    'id': u'Bot: Penggantian teks otomatis %s',
    'is': u'Vélmenni: breyti texta %s',
    'it': u'Bot: Sostituzione automatica %s',
    'ja': u'ロボットによる: 文字置き換え %s',
    'ka': u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
    'kk': u'Бот: Мәтінді өздікті алмастырды: %s',
    'ksh': u'Bot: hät outomatesch Täx jetuusch: %s',
    'lt': u'robotas: Automatinis teksto keitimas %s',
    'nds': u'Bot: Text automaatsch utwesselt: %s',
    'nds-nl': u'Bot: autematisch tekse vervungen %s',
    'nl': u'Bot: automatisch tekst vervangen %s',
    'nn': u'robot: automatisk teksterstatning: %s',
    'no': u'robot: automatisk teksterstatning: %s',
    'pl': u'Robot automatycznie zamienia tekst %s',
    'pt': u'Bot: Mudança automática %s',
    'ru': u'Робот: Автоматизированная замена текста %s',
    'sr': u'Бот: Аутоматска замена текста %s',
    'sv': u'Bot: Automatisk textersättning: %s',
    'uk': u'Бот: Автоматизована заміна тексту: %s',
    'zh': u'機器人:執行文字代換作業 %s',
}


class XmlDumpReplacePageGenerator:
    """
    Iterator that will yield Pages that might contain text to replace.

    These pages will be retrieved from a local XML dump file.
    Arguments:
        * xmlFilename  - The dump's path, either absolute or relative
        * xmlStart     - Skip all articles in the dump before this one
        * replacements - A list of 2-tuples of original text (as a
                         compiled regular expression) and replacement
                         text (as a string).
        * exceptions   - A dictionary which defines when to ignore an
                         occurrence. See the documentation of the ReplaceRobot
                         constructor below.

    """
    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        self.skipping = bool(xmlStart)

        self.excsInside = []
        if "inside-tags" in self.exceptions:
            self.excsInside += self.exceptions['inside-tags']
        if "inside" in self.exceptions:
            self.excsInside += self.exceptions['inside']
        import xmlreader
        self.site = pywikibot.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()

    def __iter__(self):
        try:
            for entry in self.parser:
                if self.skipping:
                    if entry.title != self.xmlStart:
                        continue
                    self.skipping = False
                if not self.isTitleExcepted(entry.title) \
                        and not self.isTextExcepted(entry.text):
                    new_text = entry.text
                    for old, new in self.replacements:
                        new_text = pywikibot.replaceExcept(new_text, old, new, self.excsInside, self.site)
                    if new_text != entry.text:
                        yield pywikibot.Page(self.site, entry.title)
        except KeyboardInterrupt:
            try:
                if not self.skipping:
                    pywikibot.output(
                        u'To resume, use "-xmlstart:%s" on the command line.'
                        % entry.title)
            except NameError:
                pass

    def isTitleExcepted(self, title):
        if "title" in self.exceptions:
            for exc in self.exceptions['title']:
                if exc.search(title):
                    return True
        if "require-title" in self.exceptions:
            for req in self.exceptions['require-title']:
                if not req.search(title): # if not all requirements are met:
                    return True

        return False

    def isTextExcepted(self, text):
        if "text-contains" in self.exceptions:
            for exc in self.exceptions['text-contains']:
                if exc.search(text):
                    return True
        return False


class ReplaceRobot:
    """
    A bot that can do text replacements.
    """
    def __init__(self, generator, replacements, exceptions={},
                 acceptall=False, allowoverlap=False, recursive=False,
                 addedCat=None, sleep=None, editSummary=''):
        """
        Arguments:
            * generator    - A generator that yields Page objects.
            * replacements - A list of 2-tuples of original text (as a
                             compiled regular expression) and replacement
                             text (as a string).
            * exceptions   - A dictionary which defines when not to change an
                             occurrence. See below.
            * acceptall    - If True, the user won't be prompted before changes
                             are made.
            * allowoverlap - If True, when matches overlap, all of them are
                             replaced.
            * addedCat     - If set to a value, add this category to every page
                             touched.

        Structure of the exceptions dictionary:
        This dictionary can have these keys:

            title
                A list of regular expressions. All pages with titles that
                are matched by one of these regular expressions are skipped.
            text-contains
                A list of regular expressions. All pages with text that
                contains a part which is matched by one of these regular
                expressions are skipped.
            inside
                A list of regular expressions. All occurrences that lie within
                a text region matched by one of these regular expressions are
                skipped.
            inside-tags
                A list of strings. These strings must be keys from the
                exceptionRegexes dictionary in pywikibot.replaceExcept().
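
            A hypothetical example of such a dictionary (all of the patterns
            and tag names below are purely illustrative):

                exceptions = {
                    'title':         [re.compile(u'^Talk:')],
                    'text-contains': [re.compile(u'#REDIRECT')],
                    'inside':        [re.compile(u'<!--.*?-->', re.DOTALL)],
                    'inside-tags':   ['nowiki', 'comment'],
                }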

        """
        self.generator = generator
        self.replacements = replacements
        self.exceptions = exceptions
        self.acceptall = acceptall
        self.allowoverlap = allowoverlap
        self.recursive = recursive
        if addedCat:
            site = pywikibot.getSite()
            self.addedCat = pywikibot.Page(site, addedCat, defaultNamespace=14)
        self.sleep = sleep
        # Some function to set default editSummary should probably be added
        self.editSummary = editSummary

    def isTitleExcepted(self, title):
        """
        Iff one of the exceptions applies for the given title, returns True.
        """
        if "title" in self.exceptions:
            for exc in self.exceptions['title']:
                if exc.search(title):
                    return True
        if "require-title" in self.exceptions:
            for req in self.exceptions['require-title']:
                if not req.search(title):
                    return True
        return False

    def isTextExcepted(self, original_text):
        """
        Iff one of the exceptions applies for the given page contents,
        returns True.
        """
        if "text-contains" in self.exceptions:
            for exc in self.exceptions['text-contains']:
                if exc.search(original_text):
                    return True
        return False

    def doReplacements(self, original_text):
        """
        Returns the text which is generated by applying all replacements to
        the given text.
        """
        new_text = original_text
        exceptions = []
        if "inside-tags" in self.exceptions:
            exceptions += self.exceptions['inside-tags']
        if "inside" in self.exceptions:
            exceptions += self.exceptions['inside']
        for old, new in self.replacements:
            if self.sleep is not None:
                time.sleep(self.sleep)
            new_text = pywikibot.replaceExcept(new_text, old, new, exceptions,
                                               allowoverlap=self.allowoverlap)
        return new_text

    def run(self):
        """
        Starts the robot.
        """
        # Run the generator which will yield Pages which might need to be
        # changed.
        for page in self.generator:
            if self.isTitleExcepted(page.title()):
                pywikibot.output(
                    u'Skipping %s because the title is on the exceptions list.'
                    % page.aslink())
                continue
            try:
                # Load the page's text from the wiki
                original_text = page.get(get_redirect=True)
                if not page.canBeEdited():
                    pywikibot.output(u"You can't edit page %s"
                                     % page.aslink())
                    continue
            except pywikibot.NoPage:
                pywikibot.output(u'Page %s not found' % page.aslink())
                continue
            new_text = original_text
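            # Loop so that the replacements and the diff can be redone after
            # the user edits the text or reloads the page in the browser.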
            while True:
                if self.isTextExcepted(new_text):
                    pywikibot.output(
    u'Skipping %s because it contains text that is on the exceptions list.'
                        % page.aslink())
                    break
                new_text = self.doReplacements(new_text)
                if new_text == original_text:
                    pywikibot.output(u'No changes were necessary in %s'
                                      % page.aslink())
                    break
                if self.recursive:
                    newest_text = self.doReplacements(new_text)
                    while (newest_text!=new_text):
                        new_text = newest_text
                        newest_text = self.doReplacements(new_text)
                if hasattr(self, "addedCat"):
                    cats = page.categories()
                    if self.addedCat not in cats:
                        cats.append(self.addedCat)
                        new_text = pywikibot.replaceCategoryLinks(new_text,
                                                                  cats)
                # Show the title of the page we're working on.
                # Highlight the title in purple.
                pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                                 % page.title())
                pywikibot.showDiff(original_text, new_text)
                if self.acceptall:
                    break
                choice = pywikibot.inputChoice(
                            u'Do you want to accept these changes?',
                            ['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"],
                            ['y', 'N', 'e', 'b', 'a', 'q'], 'N')
                if choice == 'e':
                    editor = editarticle.TextEditor()
                    as_edited = editor.edit(original_text)
                    # if user didn't press Cancel
                    if as_edited and as_edited != new_text:
                        new_text = as_edited
                    continue
                if choice == 'b':
                    webbrowser.open("http://%s%s" % (
                        page.site().hostname(),
                        page.site().nice_get_address(page.title())
                    ))
                    pywikibot.input("Press Enter when finished in browser.")
                    original_text = page.get(get_redirect=True, force=True)
                    new_text = original_text
                    continue
                if choice == 'q':
                    return
                if choice == 'a':
                    self.acceptall = True
                if choice == 'y':
                    page.put_async(new_text, self.editSummary)
                # choice must be 'N'
                break
            if self.acceptall and new_text != original_text:
                try:
                    page.put(new_text, self.editSummary)
                except pywikibot.EditConflict:
                    pywikibot.output(u'Skipping %s because of edit conflict'
                                     % (page.title(),))
                except pywikibot.SpamfilterError, e:
                    pywikibot.output(
                        u'Cannot change %s because of blacklist entry %s'
                        % (page.title(), e.url))
                except pywikibot.PageNotSaved, error:
                    pywikibot.output(u'Error putting page: %s'
                                     % (error.args,))
                except pywikibot.LockedPage:
                    pywikibot.output(u'Skipping %s (locked page)'
                                     % (page.title(),))

def prepareRegexForMySQL(pattern):
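    """Roughly rewrite a Python regular expression for a MySQL RLIKE clause.

    \s, \d and \w are mapped to the POSIX classes [:space:], [:digit:] and
    [:alnum:], and single quotes are backslash-escaped. The conversion is
    only approximate.
    """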
    pattern = pattern.replace('\s', '[:space:]')
    pattern = pattern.replace('\d', '[:digit:]')
    pattern = pattern.replace('\w', '[:alnum:]')

    pattern = pattern.replace("'", "\\" + "'")
    #pattern = pattern.replace('\\', '\\\\')
    #for char in ['[', ']', "'"]:
    #    pattern = pattern.replace(char, '\%s' % char)
    return pattern


def main(*args):
    add_cat = None
    gen = None
    # summary message
    summary_commandline = False
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = {
        'title':         [],
        'text-contains': [],
        'inside':        [],
        'inside-tags':   [],
        'require-title': [], # using a separate requirements dict needs some
    }                        # major refactoring of code.

    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used
    # if -xml flag is present
    xmlFilename = None
    useSql = False
    PageTitles = []
    # will become True when the user presses a ('yes to all') or uses the
    # -always flag.
    acceptall = False
    # Will become True if the user inputs the commandline parameter -nocase
    caseInsensitive = False
    # Will become True if the user inputs the commandline parameter -dotall
    dotall = False
    # Will become True if the user inputs the commandline parameter -multiline
    multiline = False
    # Do all hits when they overlap
    allowoverlap = False
    # Do not recurse replacement
    recursive = False
    # This is the maximum number of pages to load per query    
    maxquerysize = 60
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # Load default summary message.
    # BUG WARNING: This is probably incompatible with the -lang parameter.
    editSummary = pywikibot.translate(pywikibot.getSite(), msg)
    # Sleep some time between one regex and the next (when using -fix) so as
    # not to waste too much CPU.
    sleep = None

    # Read commandline parameters.
    for arg in pywikibot.handleArgs(*args):
        if arg == '-regex':
            regex = True
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = pywikibot.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = pywikibot.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg =='-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(pywikibot.input(
                                    u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-excepttitle:'):
            exceptions['title'].append(arg[13:])
        elif arg.startswith('-requiretitle:'):
            exceptions['require-title'].append(arg[14:])
        elif arg.startswith('-excepttext:'):
            exceptions['text-contains'].append(arg[12:])
        elif arg.startswith('-exceptinside:'):
            exceptions['inside'].append(arg[14:])
        elif arg.startswith('-exceptinsidetag:'):
            exceptions['inside-tags'].append(arg[17:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg.startswith('-sleep:'):
            sleep = float(arg[7:])
        elif arg == '-always':
            acceptall = True
        elif arg == '-recursive':
            recursive = True
        elif arg == '-nocase':
            caseInsensitive = True
        elif arg == '-dotall':
            dotall = True
        elif arg == '-multiline':
            multiline = True
        elif arg.startswith('-addcat:'):
            add_cat = arg[8:]
        elif arg.startswith('-summary:'):
            editSummary = arg[9:]
            summary_commandline = True
        elif arg.startswith('-allowoverlap'):
            allowoverlap = True
        elif arg.startswith('-query:'):
            maxquerysize = int(arg[7:])
        else:
            if not genFactory.handleArg(arg):
                commandline_replacements.append(arg)

    if (len(commandline_replacements) % 2):
        raise pywikibot.Error, 'require even number of replacements.'
    elif (len(commandline_replacements) == 2 and fix is None):
        replacements.append((commandline_replacements[0],
                             commandline_replacements[1]))
        if not summary_commandline:
            editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % (' (-' + commandline_replacements[0] + ' +'
                                   + commandline_replacements[1] + ')')
    elif (len(commandline_replacements) > 1):
        if (fix is None):
            for i in xrange (0, len(commandline_replacements), 2):
                replacements.append((commandline_replacements[i],
                                     commandline_replacements[i + 1]))
            if not summary_commandline:
                pairs = [( commandline_replacements[i],
                           commandline_replacements[i + 1] )
                         for i in range(0, len(commandline_replacements), 2)]
                replacementsDescription = '(%s)' % ', '.join(
                    [('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
                editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % replacementsDescription
        else:
            raise pywikibot.Error(
                'Specifying -fix with replacements is undefined')
    elif fix is None:
        old = pywikibot.input(u'Please enter the text that should be replaced:')
        new = pywikibot.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = pywikibot.input(
u'Please enter another text that should be replaced, or press Enter to start:')
            if old == '':
                change = change + ')'
                break
            new = pywikibot.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        if not summary_commandline:
            default_summary_message =  pywikibot.translate(pywikibot.getSite(), msg) % change
            pywikibot.output(u'The summary message will default to: %s'
                             % default_summary_message)
            summary_message = pywikibot.input(
u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:')
            if summary_message == '':
                summary_message = default_summary_message
            editSummary = summary_message

    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes.fixes[fix]
        except KeyError:
            pywikibot.output(u'Available predefined fixes are: %s'
                             % fixes.fixes.keys())
            return
        if "regex" in fix:
            regex = fix['regex']
        if "msg" in fix:
            editSummary = pywikibot.translate(pywikibot.getSite(), fix['msg'])
        if "exceptions" in fix:
            exceptions = fix['exceptions']
        if "nocase" in fix:
            caseInsensitive = fix['nocase']
        replacements = fix['replacements']

    #Set the regular expression flags
    flags = re.UNICODE
    if caseInsensitive:
        flags = flags | re.IGNORECASE
    if dotall:
        flags = flags | re.DOTALL
    if multiline:
        flags = flags | re.MULTILINE

    # Pre-compile all regular expressions here to save time later
    for i in range(len(replacements)):
        old, new = replacements[i]
        if not regex:
            old = re.escape(old)
        oldR = re.compile(old, flags)
        replacements[i] = oldR, new

    for exceptionCategory in ['title', 'require-title', 'text-contains', 'inside']:
        if exceptionCategory in exceptions:
            patterns = exceptions[exceptionCategory]
            if not regex:
                patterns = [re.escape(pattern) for pattern in patterns]
            patterns = [re.compile(pattern, flags) for pattern in patterns]
            exceptions[exceptionCategory] = patterns

    if xmlFilename:
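        # -xmlstart is optional, so xmlStart may never have been assigned.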
        try:
            xmlStart
        except NameError:
            xmlStart = None
        gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart,
                                          replacements, exceptions)
    elif useSql:
        whereClause = 'WHERE (%s)' % ' OR '.join(
            ["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
             for (old, new) in replacements])
        if exceptions:
            exceptClause = 'AND NOT (%s)' % ' OR '.join(
                ["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern)
                 for exc in exceptions])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)
    elif PageTitles:
        pages = [pywikibot.Page(pywikibot.getSite(), PageTitle)
                 for PageTitle in PageTitles]
        gen = iter(pages)

    gen = genFactory.getCombinedGenerator(gen)
    if not gen:
        # syntax error, show help text from the top of this file
        pywikibot.showHelp('replace')
        return
    if xmlFilename:
        # XML parsing can be quite slow, so use smaller batches and
        # longer lookahead.
        preloadingGen = pagegenerators.PreloadingGenerator(gen,
                                            pageNumber=20, lookahead=100)
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=maxquerysize)
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep, editSummary)
    bot.run()


if __name__ == "__main__":
    try:
        main()
    finally:
        pywikibot.stopme()
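
A usage sketch (assuming a configured pywikipedia installation; the word pair, page list and -excepttext option are reused from the examples in the docstring above). The -query option added in this version simply caps how many pages are preloaded per batch:

    python replace.py referer referrer -file:typos.txt -excepttext:HTTP -query:20

The ReplaceRobot class can also be driven from another script. A minimal, hypothetical sketch (the page title, pattern and summary are placeholders):

    import re
    import wikipedia as pywikibot
    import pagegenerators
    from replace import ReplaceRobot

    site = pywikibot.getSite()
    # Preload a single page and replace a pre-compiled pattern on it.
    gen = pagegenerators.PreloadingGenerator(
              iter([pywikibot.Page(site, u'Sandbox')]), pageNumber=60)
    bot = ReplaceRobot(gen, [(re.compile(u'Errror'), u'Error')],
                       exceptions={}, acceptall=False,
                       editSummary=u'Robot: Automated text replacement (-Errror +Error)')
    bot.run()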