No edit summary
(my mod)
Line 1: Line 1:
Standard replace.py, June 2010:
My version of replace.py, I think modified sometime in 2009:


# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
Line 65: Line 65:
                   resources. This will slow it down between a regex and another
                   resources. This will slow it down between a regex and another
                   in order not to waste too much CPU.
                   in order not to waste too much CPU.
-query:          The maximum number of pages that the bot will load at once.
                  Default value is 60. Ignored when reading an XML file.


-fix:XYZ          Perform one of the predefined replacements tasks, which are
-fix:XYZ          Perform one of the predefined replacements tasks, which are
Line 89: Line 86:
                   will be regarded as a regular expression, and the second
                   will be regarded as a regular expression, and the second
                   argument might contain expressions like \\1 or \g<name>.
                   argument might contain expressions like \\1 or \g<name>.
                  It is possible to introduce more than one pair of old text
                  and replacement.


Examples:
Examples:
Line 104: Line 99:


     python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0
     python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0
If you want to do more than one replacement at a time, use this:
    python replace.py -xml:foobar.xml "Errror" "Error" "Faail" "Fail" -namespace:0


If you have a page called 'John Doe' and want to fix the format of ISBNs, use:
If you have a page called 'John Doe' and want to fix the format of ISBNs, use:
Line 117: Line 109:
     python replace.py referer referrer -file:typos.txt -excepttext:HTTP
     python replace.py referer referrer -file:typos.txt -excepttext:HTTP
"""
"""
from __future__ import generators
#
#
# (C) Daniel Herding & the Pywikipedia team, 2004-2009
# (C) Daniel Herding & the Pywikipediabot Team, 2004-2008
#
__version__='$Id: replace.py 7695 2009-11-26 09:28:38Z alexsh $'
#
#
# Distributed under the terms of the MIT license.
# Distributed under the terms of the MIT license.
#
#


from __future__ import generators
import sys, re, time
import sys, re, time
import wikipedia as pywikibot
import wikipedia, pagegenerators
import pagegenerators
import editarticle
import editarticle
import webbrowser
import webbrowser
Line 142: Line 131:
}
}


__version__='$Id: replace.py 6844 2009-05-07 09:27:39Z siebrand $'


# Summary messages in different languages
# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# below.
# below.`v
msg = {
msg = {
     'ar': u'%s روبوت : استبدال تلقائي للنص',
     'ar': u'%s روبوت : استبدال تلقائي للنص',
Line 178: Line 168:
     'sr': u'Бот: Аутоматска замена текста %s',
     'sr': u'Бот: Аутоматска замена текста %s',
     'sv': u'Bot: Automatisk textersättning: %s',
     'sv': u'Bot: Automatisk textersättning: %s',
    'uk': u'Бот: Автоматизована заміна тексту: %s',
     'zh': u'機器人:執行文字代換作業 %s',
     'zh': u'機器人:執行文字代換作業 %s',
}
}
Line 207: Line 196:


         self.excsInside = []
         self.excsInside = []
         if "inside-tags" in self.exceptions:
         if 'inside-tags' in self.exceptions:
             self.excsInside += self.exceptions['inside-tags']
             self.excsInside += self.exceptions['inside-tags']
         if "inside" in self.exceptions:
         if 'inside' in self.exceptions:
             self.excsInside += self.exceptions['inside']
             self.excsInside += self.exceptions['inside']
         import xmlreader
         import xmlreader
         self.site = pywikibot.getSite()
         self.site = wikipedia.getSite()
         dump = xmlreader.XmlDump(self.xmlFilename)
         dump = xmlreader.XmlDump(self.xmlFilename)
         self.parser = dump.parse()
         self.parser = dump.parse()
Line 227: Line 216:
                     new_text = entry.text
                     new_text = entry.text
                     for old, new in self.replacements:
                     for old, new in self.replacements:
                         new_text = pywikibot.replaceExcept(new_text, old, new, self.excsInside, self.site)
                         new_text = wikipedia.replaceExcept(new_text, old, new, self.excsInside, self.site)
                     if new_text != entry.text:
                     if new_text != entry.text:
                         yield pywikibot.Page(self.site, entry.title)
                         yield wikipedia.Page(self.site, entry.title)
         except KeyboardInterrupt:
         except KeyboardInterrupt:
             try:
             try:
                 if not self.skipping:
                 if not self.skipping:
                     pywikibot.output(
                     wikipedia.output(
                         u'To resume, use "-xmlstart:%s" on the command line.'
                         u'To resume, use "-xmlstart:%s" on the command line.'
                         % entry.title)
                         % entry.title)
Line 240: Line 229:


     def isTitleExcepted(self, title):
     def isTitleExcepted(self, title):
         if "title" in self.exceptions:
         if 'title' in self.exceptions:
             for exc in self.exceptions['title']:
             for exc in self.exceptions['title']:
                 if exc.search(title):
                 if exc.search(title):
                     return True
                     return True
         if "require-title" in self.exceptions:
         if 'require-title' in self.exceptions:
             for req in self.exceptions['require-title']:
             for req in self.exceptions['require-title']:
                 if not req.search(title): # if not all requirements are met:
                 if not req.search(title): # if not all requirements are met:
Line 252: Line 241:


     def isTextExcepted(self, text):
     def isTextExcepted(self, text):
         if "text-contains" in self.exceptions:
         if 'text-contains' in self.exceptions:
             for exc in self.exceptions['text-contains']:
             for exc in self.exceptions['text-contains']:
                 if exc.search(text):
                 if exc.search(text):
Line 297: Line 286:
             inside-tags
             inside-tags
                 A list of strings. These strings must be keys from the
                 A list of strings. These strings must be keys from the
                 exceptionRegexes dictionary in pywikibot.replaceExcept().
                 exceptionRegexes dictionary in wikipedia.replaceExcept().


         """
         """
Line 306: Line 295:
         self.allowoverlap = allowoverlap
         self.allowoverlap = allowoverlap
         self.recursive = recursive
         self.recursive = recursive
        # Some function to set default editSummary should probably be added
        self.editSummary = editSummary
         if addedCat:
         if addedCat:
             site = pywikibot.getSite()
             site = wikipedia.getSite()
             self.addedCat = pywikibot.Page(site, addedCat, defaultNamespace=14)
            cat_ns = site.category_namespaces()[0]
             self.addedCat = wikipedia.Page(site,
                                          cat_ns + ':' + addedCat)
         self.sleep = sleep
         self.sleep = sleep
        # Some function to set default editSummary should probably be added
        self.editSummary = editSummary


     def isTitleExcepted(self, title):
     def isTitleExcepted(self, title):
Line 317: Line 308:
         Iff one of the exceptions applies for the given title, returns True.
         Iff one of the exceptions applies for the given title, returns True.
         """
         """
         if "title" in self.exceptions:
         if 'title' in self.exceptions:
             for exc in self.exceptions['title']:
             for exc in self.exceptions['title']:
                 if exc.search(title):
                 if exc.search(title):
                     return True
                     return True
         if "require-title" in self.exceptions:
         if 'require-title' in self.exceptions:
             for req in self.exceptions['require-title']:
             for req in self.exceptions['require-title']:
                 if not req.search(title):
                 if not req.search(title):
Line 332: Line 323:
         returns True.
         returns True.
         """
         """
         if "text-contains" in self.exceptions:
         if 'text-contains' in self.exceptions:
             for exc in self.exceptions['text-contains']:
             for exc in self.exceptions['text-contains']:
                 if exc.search(original_text):
                 if exc.search(original_text):
Line 345: Line 336:
         new_text = original_text
         new_text = original_text
         exceptions = []
         exceptions = []
         if "inside-tags" in self.exceptions:
         if 'inside-tags' in self.exceptions:
             exceptions += self.exceptions['inside-tags']
             exceptions += self.exceptions['inside-tags']
         if "inside" in self.exceptions:
         if 'inside' in self.exceptions:
             exceptions += self.exceptions['inside']
             exceptions += self.exceptions['inside']
         for old, new in self.replacements:
         for old, new in self.replacements:
             if self.sleep is not None:
             if self.sleep is not None:
                 time.sleep(self.sleep)
                 time.sleep(self.sleep)
             new_text = pywikibot.replaceExcept(new_text, old, new, exceptions,
             new_text = wikipedia.replaceExcept(new_text, old, new, exceptions,
                                               allowoverlap=self.allowoverlap)
                                               allowoverlap=self.allowoverlap)
         return new_text
         return new_text
Line 364: Line 355:
         for page in self.generator:
         for page in self.generator:
             if self.isTitleExcepted(page.title()):
             if self.isTitleExcepted(page.title()):
                 pywikibot.output(
                 wikipedia.output(
                     u'Skipping %s because the title is on the exceptions list.'
                     u'Skipping %s because the title is on the exceptions list.'
                     % page.aslink())
                     % page.aslink())
Line 372: Line 363:
                 original_text = page.get(get_redirect=True)
                 original_text = page.get(get_redirect=True)
                 if not page.canBeEdited():
                 if not page.canBeEdited():
                     pywikibot.output(u"You can't edit page %s"
                     wikipedia.output(u"You can't edit page %s"
                                     % page.aslink())
                                     % page.aslink())
                     continue
                     continue
             except pywikibot.NoPage:
             except wikipedia.NoPage:
                 pywikibot.output(u'Page %s not found' % page.aslink())
                 wikipedia.output(u'Page %s not found' % page.aslink())
                 continue
                 continue
             new_text = original_text
             new_text = original_text
             while True:
             while True:
                 if self.isTextExcepted(new_text):
                 if self.isTextExcepted(new_text):
                     pywikibot.output(
                     wikipedia.output(
     u'Skipping %s because it contains text that is on the exceptions list.'
     u'Skipping %s because it contains text that is on the exceptions list.'
                         % page.aslink())
                         % page.aslink())
Line 387: Line 378:
                 new_text = self.doReplacements(new_text)
                 new_text = self.doReplacements(new_text)
                 if new_text == original_text:
                 if new_text == original_text:
                     pywikibot.output(u'No changes were necessary in %s'
                     wikipedia.output('No changes were necessary in %s'
                                      % page.aslink())
                                    % page.aslink())
                     break
                     break
                 if self.recursive:
                 if self.recursive:
Line 396: Line 387:
                         newest_text = self.doReplacements(new_text)
                         newest_text = self.doReplacements(new_text)
                 if hasattr(self, "addedCat"):
                 if hasattr(self, "addedCat"):
                     cats = page.categories()
                     cats = page.categories(nofollow_redirects=True)
                     if self.addedCat not in cats:
                     if self.addedCat not in cats:
                         cats.append(self.addedCat)
                         cats.append(self.addedCat)
                         new_text = pywikibot.replaceCategoryLinks(new_text,
                         new_text = wikipedia.replaceCategoryLinks(new_text,
                                                                   cats)
                                                                   cats)
                 # Show the title of the page we're working on.
                 # Show the title of the page we're working on.
                 # Highlight the title in purple.
                 # Highlight the title in purple.
                 pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                 wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                                 % page.title())
                                 % page.title())
                 pywikibot.showDiff(original_text, new_text)
                 wikipedia.showDiff(original_text, new_text)
                 if self.acceptall:
                 if self.acceptall:
                     break
                     break
                 choice = pywikibot.inputChoice(
                 choice = wikipedia.inputChoice(
                             u'Do you want to accept these changes?',
                             u'Do you want to accept these changes?',
                             ['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"],
                             ['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"],
Line 424: Line 415:
                         page.site().nice_get_address(page.title())
                         page.site().nice_get_address(page.title())
                     ))
                     ))
                     pywikibot.input("Press Enter when finished in browser.")
                     wikipedia.input("Press Enter when finished in browser.")
                     original_text = page.get(get_redirect=True, force=True)
                     original_text = page.get(get_redirect=True, force=True)
                     new_text = original_text
                     new_text = original_text
Line 439: Line 430:
                 try:
                 try:
                     page.put(new_text, self.editSummary)
                     page.put(new_text, self.editSummary)
                 except pywikibot.EditConflict:
                 except wikipedia.EditConflict:
                     pywikibot.output(u'Skipping %s because of edit conflict'
                     wikipedia.output(u'Skipping %s because of edit conflict'
                                     % (page.title(),))
                                     % (page.title(),))
                 except pywikibot.SpamfilterError, e:
                 except wikipedia.SpamfilterError, e:
                     pywikibot.output(
                     wikipedia.output(
                         u'Cannot change %s because of blacklist entry %s'
                         u'Cannot change %s because of blacklist entry %s'
                         % (page.title(), e.url))
                         % (page.title(), e.url))
                 except pywikibot.PageNotSaved, error:
                 except wikipedia.PageNotSaved, error:
                     pywikibot.output(u'Error putting page: %s'
                     wikipedia.output(u'Error putting page: %s'
                                     % (error.args,))
                                     % (error.args,))
                 except pywikibot.LockedPage:
                 except wikipedia.LockedPage:
                     pywikibot.output(u'Skipping %s (locked page)'
                     wikipedia.output(u'Skipping %s (locked page)'
                                     % (page.title(),))
                                     % (page.title(),))


Line 507: Line 498:
     # Do not recurse replacement
     # Do not recurse replacement
     recursive = False
     recursive = False
    # This is the maximum number of pages to load per query   
    maxquerysize = 60
     # This factory is responsible for processing command line arguments
     # This factory is responsible for processing command line arguments
     # that are also used by other scripts and that determine on which pages
     # that are also used by other scripts and that determine on which pages
Line 515: Line 504:
     # Load default summary message.
     # Load default summary message.
     # BUG WARNING: This is probably incompatible with the -lang parameter.
     # BUG WARNING: This is probably incompatible with the -lang parameter.
     editSummary = pywikibot.translate(pywikibot.getSite(), msg)
     editSummary = wikipedia.translate(wikipedia.getSite(), msg)
     # Between a regex and another (using -fix) sleep some time (not to waste
     # Between a regex and another (using -fix) sleep some time (not to waste
     # too much CPU
     # too much CPU
Line 521: Line 510:


     # Read commandline parameters.
     # Read commandline parameters.
     for arg in pywikibot.handleArgs(*args):
     for arg in wikipedia.handleArgs(*args):
         if arg == '-regex':
         if arg == '-regex':
             regex = True
             regex = True
         elif arg.startswith('-xmlstart'):
         elif arg.startswith('-xmlstart'):
             if len(arg) == 9:
             if len(arg) == 9:
                 xmlStart = pywikibot.input(
                 xmlStart = wikipedia.input(
                     u'Please enter the dumped article to start with:')
                     u'Please enter the dumped article to start with:')
             else:
             else:
Line 532: Line 521:
         elif arg.startswith('-xml'):
         elif arg.startswith('-xml'):
             if len(arg) == 4:
             if len(arg) == 4:
                 xmlFilename = pywikibot.input(
                 xmlFilename = wikipedia.input(
                     u'Please enter the XML dump\'s filename:')
                     u'Please enter the XML dump\'s filename:')
             else:
             else:
Line 540: Line 529:
         elif arg.startswith('-page'):
         elif arg.startswith('-page'):
             if len(arg) == 5:
             if len(arg) == 5:
                 PageTitles.append(pywikibot.input(
                 PageTitles.append(wikipedia.input(
                                     u'Which page do you want to change?'))
                                     u'Which page do you want to change?'))
             else:
             else:
Line 569: Line 558:
             multiline = True
             multiline = True
         elif arg.startswith('-addcat:'):
         elif arg.startswith('-addcat:'):
             add_cat = arg[8:]
             add_cat = arg[len('addcat:'):]
         elif arg.startswith('-summary:'):
         elif arg.startswith('-summary:'):
             editSummary = arg[9:]
             editSummary = arg[len('-summary:'):]
             summary_commandline = True
             summary_commandline = True
         elif arg.startswith('-allowoverlap'):
         elif arg.startswith('-allowoverlap'):
             allowoverlap = True
             allowoverlap = True
        elif arg.startswith('-query:'):
            maxquerysize = int(arg[7:])
         else:
         else:
             if not genFactory.handleArg(arg):
             if not genFactory.handleArg(arg):
Line 582: Line 569:


     if (len(commandline_replacements) % 2):
     if (len(commandline_replacements) % 2):
         raise pywikibot.Error, 'require even number of replacements.'
         raise wikipedia.Error, 'require even number of replacements.'
     elif (len(commandline_replacements) == 2 and fix is None):
     elif (len(commandline_replacements) == 2 and fix is None):
         replacements.append((commandline_replacements[0],
         replacements.append((commandline_replacements[0],
                             commandline_replacements[1]))
                             commandline_replacements[1]))
         if not summary_commandline:
         if summary_commandline == False:
             editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % (' (-' + commandline_replacements[0] + ' +'
             editSummary = wikipedia.translate(wikipedia.getSite(), msg ) % (' (-' + commandline_replacements[0] + ' +'
                                   + commandline_replacements[1] + ')')
                                   + commandline_replacements[1] + ')')
     elif (len(commandline_replacements) > 1):
     elif (len(commandline_replacements) > 1):
Line 594: Line 581:
                 replacements.append((commandline_replacements[i],
                 replacements.append((commandline_replacements[i],
                                     commandline_replacements[i + 1]))
                                     commandline_replacements[i + 1]))
             if not summary_commandline:
             if summary_commandline == False:
                 pairs = [( commandline_replacements[i],
                 pairs = [( commandline_replacements[i],
                           commandline_replacements[i + 1] )
                           commandline_replacements[i + 1] )
Line 600: Line 587:
                 replacementsDescription = '(%s)' % ', '.join(
                 replacementsDescription = '(%s)' % ', '.join(
                     [('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
                     [('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
                 editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % replacementsDescription
                 editSummary = wikipedia.translate(wikipedia.getSite(), msg ) % replacementsDescription
         else:
         else:
           raise pywikibot.Error(
           raise wikipedia.Error(
               'Specifying -fix with replacements is undefined')
               'Specifying -fix with replacements is undefined')
     elif fix is None:
     elif fix is None:
         old = pywikibot.input(u'Please enter the text that should be replaced:')
         old = wikipedia.input(u'Please enter the text that should be replaced:')
         new = pywikibot.input(u'Please enter the new text:')
         new = wikipedia.input(u'Please enter the new text:')
         change = '(-' + old + ' +' + new
         change = '(-' + old + ' +' + new
         replacements.append((old, new))
         replacements.append((old, new))
         while True:
         while True:
             old = pywikibot.input(
             old = wikipedia.input(
u'Please enter another text that should be replaced, or press Enter to start:')
u'Please enter another text that should be replaced, or press Enter to start:')
             if old == '':
             if old == '':
                 change = change + ')'
                 change = change + ')'
                 break
                 break
             new = pywikibot.input(u'Please enter the new text:')
             new = wikipedia.input(u'Please enter the new text:')
             change = change + ' & -' + old + ' +' + new
             change = change + ' & -' + old + ' +' + new
             replacements.append((old, new))
             replacements.append((old, new))
         if not summary_commandline:
         if not summary_commandline == True:
             default_summary_message =  pywikibot.translate(pywikibot.getSite(), msg) % change
             default_summary_message =  wikipedia.translate(wikipedia.getSite(), msg) % change
             pywikibot.output(u'The summary message will default to: %s'
             wikipedia.output(u'The summary message will default to: %s'
                             % default_summary_message)
                             % default_summary_message)
             summary_message = pywikibot.input(
             summary_message = wikipedia.input(
u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:')
u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:')
             if summary_message == '':
             if summary_message == '':
Line 633: Line 620:
             fix = fixes.fixes[fix]
             fix = fixes.fixes[fix]
         except KeyError:
         except KeyError:
             pywikibot.output(u'Available predefined fixes are: %s'
             wikipedia.output(u'Available predefined fixes are: %s'
                             % fixes.fixes.keys())
                             % fixes.fixes.keys())
             return
             return
         if "regex" in fix:
         if 'regex' in fix:
             regex = fix['regex']
             regex = fix['regex']
         if "msg" in fix:
         if 'msg' in fix:
             editSummary = pywikibot.translate(pywikibot.getSite(), fix['msg'])
             editSummary = wikipedia.translate(wikipedia.getSite(), fix['msg'])
         if "exceptions" in fix:
         if 'exceptions' in fix:
             exceptions = fix['exceptions']
             exceptions = fix['exceptions']
         if "nocase" in fix:
         if 'nocase' in fix:
             caseInsensitive = fix['nocase']
             caseInsensitive = fix['nocase']
         replacements = fix['replacements']
         replacements = fix['replacements']
Line 697: Line 684:
         gen = pagegenerators.MySQLPageGenerator(query)
         gen = pagegenerators.MySQLPageGenerator(query)
     elif PageTitles:
     elif PageTitles:
         pages = [pywikibot.Page(pywikibot.getSite(), PageTitle)
         pages = [wikipedia.Page(wikipedia.getSite(), PageTitle)
                 for PageTitle in PageTitles]
                 for PageTitle in PageTitles]
         gen = iter(pages)
         gen = iter(pages)
Line 704: Line 691:
     if not gen:
     if not gen:
         # syntax error, show help text from the top of this file
         # syntax error, show help text from the top of this file
         pywikibot.showHelp('replace')
         wikipedia.showHelp('replace')
         return
         return
     if xmlFilename:
     if xmlFilename:
Line 712: Line 699:
                                             pageNumber=20, lookahead=100)
                                             pageNumber=20, lookahead=100)
     else:
     else:
         preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=maxquerysize)
         preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
     bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep, editSummary)
     bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep, editSummary)
     bot.run()
     bot.run()


if __name__ == "__main__":
if __name__ == "__main__":
Line 721: Line 707:
         main()
         main()
     finally:
     finally:
         pywikibot.stopme()
         wikipedia.stopme()

Revision as of 08:33, 8 June 2010

My version of replace.py, I think modified sometime in 2009:

  1. -*- coding: utf-8 -*-

""" This bot will make direct text replacements. It will retrieve information on which pages might need changes either from an XML dump or a text file, or only change a single page.

These command line parameters can be used to specify which pages to work on:

&params;

-xml Retrieve information from a local XML dump (pages-articles

                 or pages-meta-current, see http://download.wikimedia.org).
                 Argument can also be given as "-xml:filename".

-page Only edit a specific page.

                 Argument can also be given as "-page:pagetitle". You can
                 give this parameter multiple times to edit multiple pages.

Furthermore, the following command line parameters are supported:

-regex Make replacements using regular expressions. If this argument

                 isn't given, the bot will make simple text replacements.

-nocase Use case insensitive regular expressions.

-dotall Make the dot match any character at all, including a newline.

                 Without this flag, '.' will match anything except a newline.

-multiline '^' and '$' will now match begin and end of each line.

-xmlstart (Only works with -xml) Skip all articles in the XML dump

                 before the one specified (may also be given as
                 -xmlstart:Article).

-addcat:cat_name Adds "cat_name" category to every altered page.

-excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex

                 argument is given, XYZ will be regarded as a regular
                 expression.

-requiretitle:XYZ Only do pages with titles that contain XYZ. If the -regex

                 argument is given, XYZ will be regarded as a regular
                 expression.

-excepttext:XYZ Skip pages which contain the text XYZ. If the -regex

                 argument is given, XYZ will be regarded as a regular
                 expression.

-exceptinside:XYZ Skip occurences of the to-be-replaced text which lie

                 within XYZ. If the -regex argument is given, XYZ will be
                 regarded as a regular expression.

-exceptinsidetag:XYZ Skip occurences of the to-be-replaced text which lie

                 within an XYZ tag.

-summary:XYZ Set the summary message text for the edit to XYZ, bypassing

                 the predefined message texts with original and replacements
                 inserted.

-sleep:123 If you use -fix you can check multiple regex at the same time

                 in every page. This can lead to a great waste of CPU because
                 the bot will check every regex without waiting using all the
                 resources. This will slow it down between a regex and another
                 in order not to waste too much CPU.

-fix:XYZ Perform one of the predefined replacements tasks, which are

                 given in the dictionary 'fixes' defined inside the file
                 fixes.py.
                 The -regex and -nocase argument and given replacements will
                 be ignored if you use -fix.
                 Currently available predefined fixes are:

&fixes-help;

-always Don't prompt you for each replacement

-recursive Recurse replacement as long as possible. Be careful, this

                 might lead to an infinite loop.

-allowoverlap When occurences of the pattern overlap, replace all of them.

                 Be careful, this might lead to an infinite loop.

other: First argument is the old text, second argument is the new

                 text. If the -regex argument is given, the first argument
                 will be regarded as a regular expression, and the second
                 argument might contain expressions like \\1 or \g<name>.

Examples:

If you want to change templates from the old syntax, e.g. Template:Stub, to the new syntax, e.g. Template:Stub, download an XML dump file (pages-articles) from http://download.wikimedia.org, then use this command:

   python replace.py -xml -regex "Template:(.*?)" "Template:\\1"

If you have a dump called foobar.xml and want to fix typos in articles, e.g. Errror -> Error, use this:

   python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0

If you have a page called 'John Doe' and want to fix the format of ISBNs, use:

   python replace.py -page:John_Doe -fix:isbn

This command will change 'referer' to 'referrer', but not in pages which talk about HTTP, where the typo has become part of the standard:

   python replace.py referer referrer -file:typos.txt -excepttext:HTTP

"""

  1. (C) Daniel Herding & the Pywikipediabot Team, 2004-2008
  2. Distributed under the terms of the MIT license.

from __future__ import generators import sys, re, time import wikipedia, pagegenerators import editarticle import webbrowser

  1. Imports predefined replacements tasks from fixes.py

import fixes

  1. This is required for the text that is shown when you run this script
  2. with the parameter -help.

docuReplacements = {

   '&params;':     pagegenerators.parameterHelp,
   '&fixes-help;': fixes.help,

}

__version__='$Id: replace.py 6844 2009-05-07 09:27:39Z siebrand $'

  1. Summary messages in different languages
  2. NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
  3. below.`v

msg = {

   'ar': u'%s روبوت : استبدال تلقائي للنص',
   'ca': u'Robot: Reemplaçament automàtic de text %s',
   'cs': u'Robot automaticky nahradil text: %s',
   'de': u'Bot: Automatisierte Textersetzung %s',
   'el': u'Ρομπότ: Αυτόματη αντικατάσταση κειμένου %s',
   'en': u'Robot: Automated text replacement %s',
   'es': u'Robot: Reemplazo automático de texto %s',
   'fa': u'ربات: تغییر خودکار متن %s',
   'fi': u'Botti korvasi automaattisesti tekstin %s',
   'fr': u'Robot : Remplacement de texte automatisé %s',
   'he': u'בוט: החלפת טקסט אוטומטית %s',
   'hu': u'Robot: Automatikus szövegcsere %s',
   'ia': u'Robot: Reimplaciamento automatic de texto %s',
   'id': u'Bot: Penggantian teks otomatis %s',
   'is': u'Vélmenni: breyti texta %s',
   'it': u'Bot: Sostituzione automatica %s',
   'ja': u'ロボットによる: 文字置き換え %s',
   'ka': u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
   'kk': u'Бот: Мәтінді өздікті алмастырды: %s',
   'ksh': u'Bot: hät outomatesch Täx jetuusch: %s',
   'lt': u'robotas: Automatinis teksto keitimas %s',
   'nds': u'Bot: Text automaatsch utwesselt: %s',
   'nds-nl': u'Bot: autematisch tekse vervungen %s',
   'nl': u'Bot: automatisch tekst vervangen %s',
   'nn': u'robot: automatisk teksterstatning: %s',
   'no': u'robot: automatisk teksterstatning: %s',
   'pl': u'Robot automatycznie zamienia tekst %s',
   'pt': u'Bot: Mudança automática %s',
   'ru': u'Робот: Автоматизированная замена текста %s',
   'sr': u'Бот: Аутоматска замена текста %s',
   'sv': u'Bot: Automatisk textersättning: %s',
   'zh': u'機器人:執行文字代換作業 %s',

}


class XmlDumpReplacePageGenerator:
    """
    Iterator yielding Pages that might contain text to replace.

    The pages are read from a local XML dump file.

    Arguments:
        * xmlFilename  - The dump's path, either absolute or relative
        * xmlStart     - Skip all articles in the dump before this one
        * replacements - A list of 2-tuples of original text (as a
                         compiled regular expression) and replacement
                         text (as a string).
        * exceptions   - A dictionary which defines when to ignore an
                         occurrence. See docu of the ReplaceRobot
                         constructor below.
    """

    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        # While True, dump entries are skipped until xmlStart is seen.
        self.skipping = bool(xmlStart)
        # Regions in which occurrences must not be replaced.
        self.excsInside = []
        for key in ('inside-tags', 'inside'):
            if key in self.exceptions:
                self.excsInside += self.exceptions[key]
        import xmlreader
        self.site = wikipedia.getSite()
        self.parser = xmlreader.XmlDump(self.xmlFilename).parse()

    def __iter__(self):
        try:
            for entry in self.parser:
                if self.skipping:
                    if entry.title != self.xmlStart:
                        continue
                    self.skipping = False
                if self.isTitleExcepted(entry.title):
                    continue
                if self.isTextExcepted(entry.text):
                    continue
                # Apply all replacements; yield only if something changed.
                text = entry.text
                for pattern, substitute in self.replacements:
                    text = wikipedia.replaceExcept(text, pattern, substitute,
                                                   self.excsInside, self.site)
                if text != entry.text:
                    yield wikipedia.Page(self.site, entry.title)
        except KeyboardInterrupt:
            try:
                if not self.skipping:
                    wikipedia.output(
                        u'To resume, use "-xmlstart:%s" on the command line.'
                        % entry.title)
            except NameError:
                pass

    def isTitleExcepted(self, title):
        # A title is excepted if it matches any 'title' pattern, or if it
        # fails to match any one of the 'require-title' patterns.
        for pattern in self.exceptions.get('title', []):
            if pattern.search(title):
                return True
        for required in self.exceptions.get('require-title', []):
            if not required.search(title):
                return True
        return False

    def isTextExcepted(self, text):
        # A page text is excepted if any 'text-contains' pattern matches.
        return any(pattern.search(text)
                   for pattern in self.exceptions.get('text-contains', []))


class ReplaceRobot:

   """
   A bot that can do text replacements.
   """
   def __init__(self, generator, replacements, exceptions={},
                acceptall=False, allowoverlap=False, recursive=False,
                addedCat=None, sleep=None, editSummary=):
       """
       Arguments:
           * generator    - A generator that yields Page objects.
           * replacements - A list of 2-tuples of original text (as a
                            compiled regular expression) and replacement
                            text (as a string).
           * exceptions   - A dictionary which defines when not to change an
                            occurence. See below.
           * acceptall    - If True, the user won't be prompted before changes
                            are made.
           * allowoverlap - If True, when matches overlap, all of them are
                            replaced.
           * addedCat     - If set to a value, add this category to every page
                            touched.
       Structure of the exceptions dictionary:
       This dictionary can have these keys:
           title
               A list of regular expressions. All pages with titles that
               are matched by one of these regular expressions are skipped.
           text-contains
               A list of regular expressions. All pages with text that
               contains a part which is matched by one of these regular
               expressions are skipped.
           inside
               A list of regular expressions. All occurences are skipped which
               lie within a text region which is matched by one of these
               regular expressions.
           inside-tags
               A list of strings. These strings must be keys from the
               exceptionRegexes dictionary in wikipedia.replaceExcept().
       """
       self.generator = generator
       self.replacements = replacements
       self.exceptions = exceptions
       self.acceptall = acceptall
       self.allowoverlap = allowoverlap
       self.recursive = recursive
       # Some function to set default editSummary should probably be added
       self.editSummary = editSummary
       if addedCat:
           site = wikipedia.getSite()
           cat_ns = site.category_namespaces()[0]
           self.addedCat = wikipedia.Page(site,
                                          cat_ns + ':' + addedCat)
       self.sleep = sleep
   def isTitleExcepted(self, title):
       """
       Iff one of the exceptions applies for the given title, returns True.
       """
       if 'title' in self.exceptions:
           for exc in self.exceptions['title']:
               if exc.search(title):
                   return True
       if 'require-title' in self.exceptions:
           for req in self.exceptions['require-title']:
               if not req.search(title):
                   return True
       return False
   def isTextExcepted(self, original_text):
       """
       Iff one of the exceptions applies for the given page contents,
       returns True.
       """
       if 'text-contains' in self.exceptions:
           for exc in self.exceptions['text-contains']:
               if exc.search(original_text):
                   return True
       return False
   def doReplacements(self, original_text):
       """
       Returns the text which is generated by applying all replacements to
       the given text.
       """
       new_text = original_text
       exceptions = []
       if 'inside-tags' in self.exceptions:
           exceptions += self.exceptions['inside-tags']
       if 'inside' in self.exceptions:
           exceptions += self.exceptions['inside']
       for old, new in self.replacements:
           if self.sleep is not None:
               time.sleep(self.sleep)
           new_text = wikipedia.replaceExcept(new_text, old, new, exceptions,
                                              allowoverlap=self.allowoverlap)
       return new_text
   def run(self):
       """
       Starts the robot.
       """
       # Run the generator which will yield Pages which might need to be
       # changed.
       for page in self.generator:
           if self.isTitleExcepted(page.title()):
               wikipedia.output(
                   u'Skipping %s because the title is on the exceptions list.'
                   % page.aslink())
               continue
           try:
               # Load the page's text from the wiki
               original_text = page.get(get_redirect=True)
               if not page.canBeEdited():
                   wikipedia.output(u"You can't edit page %s"
                                    % page.aslink())
                   continue
           except wikipedia.NoPage:
               wikipedia.output(u'Page %s not found' % page.aslink())
               continue
           new_text = original_text
           while True:
               if self.isTextExcepted(new_text):
                   wikipedia.output(
   u'Skipping %s because it contains text that is on the exceptions list.'
                       % page.aslink())
                   break
               new_text = self.doReplacements(new_text)
               if new_text == original_text:
                   wikipedia.output('No changes were necessary in %s'
                                    % page.aslink())
                   break
               if self.recursive:
                   newest_text = self.doReplacements(new_text)
                   while (newest_text!=new_text):
                       new_text = newest_text
                       newest_text = self.doReplacements(new_text)
               if hasattr(self, "addedCat"):
                   cats = page.categories(nofollow_redirects=True)
                   if self.addedCat not in cats:
                       cats.append(self.addedCat)
                       new_text = wikipedia.replaceCategoryLinks(new_text,
                                                                 cats)
               # Show the title of the page we're working on.
               # Highlight the title in purple.
               wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                                % page.title())
               wikipedia.showDiff(original_text, new_text)
               if self.acceptall:
                   break
               choice = wikipedia.inputChoice(
                           u'Do you want to accept these changes?',
                           ['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"],
                           ['y', 'N', 'e', 'b', 'a', 'q'], 'N')
               if choice == 'e':
                   editor = editarticle.TextEditor()
                   as_edited = editor.edit(original_text)
                   # if user didn't press Cancel
                   if as_edited and as_edited != new_text:
                       new_text = as_edited
                   continue
               if choice == 'b':
                   webbrowser.open("http://%s%s" % (
                       page.site().hostname(),
                       page.site().nice_get_address(page.title())
                   ))
                   wikipedia.input("Press Enter when finished in browser.")
                   original_text = page.get(get_redirect=True, force=True)
                   new_text = original_text
                   continue
               if choice == 'q':
                   return
               if choice == 'a':
                   self.acceptall = True
               if choice == 'y':
                   page.put_async(new_text, self.editSummary)
               # choice must be 'N'
               break
           if self.acceptall and new_text != original_text:
               try:
                   page.put(new_text, self.editSummary)
               except wikipedia.EditConflict:
                   wikipedia.output(u'Skipping %s because of edit conflict'
                                    % (page.title(),))
               except wikipedia.SpamfilterError, e:
                   wikipedia.output(
                       u'Cannot change %s because of blacklist entry %s'
                       % (page.title(), e.url))
               except wikipedia.PageNotSaved, error:
                   wikipedia.output(u'Error putting page: %s'
                                    % (error.args,))
               except wikipedia.LockedPage:
                   wikipedia.output(u'Skipping %s (locked page)'
                                    % (page.title(),))

def prepareRegexForMySQL(pattern):
    """Convert a Python regex pattern into MySQL RLIKE (POSIX) syntax.

    Translates the Perl-style shorthand classes \\s, \\d and \\w into
    their POSIX bracket-expression equivalents, and escapes single
    quotes so the result can be embedded in a single-quoted SQL string
    literal.
    """
    # Raw strings make the two-character sequences \s, \d, \w explicit.
    pattern = pattern.replace(r'\s', '[:space:]')
    pattern = pattern.replace(r'\d', '[:digit:]')
    pattern = pattern.replace(r'\w', '[:alnum:]')
    # Escape single quotes for embedding in a quoted SQL literal.
    pattern = pattern.replace("'", "\\'")
    return pattern


def main(*args):
    """Parse the command line, build the replacement list and the page
    generator, then run a ReplaceRobot over the selected pages.

    Fixes vs. the wiki-mangled copy: three stripped empty-string
    literals restored, -addcat slice corrected (it kept the leading
    colon), and the -sql except clause no longer iterates the
    exceptions dict itself (whose keys are plain strings without a
    .pattern attribute).
    """
    add_cat = None
    gen = None
    # summary message
    summary_commandline = False
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = {
        'title':         [],
        'text-contains': [],
        'inside':        [],
        'inside-tags':   [],
        'require-title': [], # using a separate requirements dict needs some
    }                        # major refactoring of code.
    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used
    # if -xml flag is present
    xmlFilename = None
    # Article in the dump to resume from (set by -xmlstart).
    xmlStart = None
    useSql = False
    PageTitles = []
    # will become True when the user presses a ('yes to all') or uses the
    # -always flag.
    acceptall = False
    # Will become True if the user inputs the commandline parameter -nocase
    caseInsensitive = False
    # Will become True if the user inputs the commandline parameter -dotall
    dotall = False
    # Will become True if the user inputs the commandline parameter -multiline
    multiline = False
    # Do all hits when they overlap
    allowoverlap = False
    # Do not recurse replacement
    recursive = False
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # Load default summary message.
    # BUG WARNING: This is probably incompatible with the -lang parameter.
    editSummary = wikipedia.translate(wikipedia.getSite(), msg)
    # Between a regex and another (using -fix) sleep some time (not to waste
    # too much CPU
    sleep = None
    # Read commandline parameters.
    for arg in wikipedia.handleArgs(*args):
        if arg == '-regex':
            regex = True
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = wikipedia.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg == '-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(wikipedia.input(
                                    u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-excepttitle:'):
            exceptions['title'].append(arg[13:])
        elif arg.startswith('-requiretitle:'):
            exceptions['require-title'].append(arg[14:])
        elif arg.startswith('-excepttext:'):
            exceptions['text-contains'].append(arg[12:])
        elif arg.startswith('-exceptinside:'):
            exceptions['inside'].append(arg[14:])
        elif arg.startswith('-exceptinsidetag:'):
            exceptions['inside-tags'].append(arg[17:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg.startswith('-sleep:'):
            sleep = float(arg[7:])
        elif arg == '-always':
            acceptall = True
        elif arg == '-recursive':
            recursive = True
        elif arg == '-nocase':
            caseInsensitive = True
        elif arg == '-dotall':
            dotall = True
        elif arg == '-multiline':
            multiline = True
        elif arg.startswith('-addcat:'):
            # Was arg[len('addcat:'):], which left the ':' in the name.
            add_cat = arg[len('-addcat:'):]
        elif arg.startswith('-summary:'):
            editSummary = arg[len('-summary:'):]
            summary_commandline = True
        elif arg.startswith('-allowoverlap'):
            allowoverlap = True
        else:
            if not genFactory.handleArg(arg):
                commandline_replacements.append(arg)

    if (len(commandline_replacements) % 2):
        raise wikipedia.Error('require even number of replacements.')
    elif (len(commandline_replacements) == 2 and fix is None):
        replacements.append((commandline_replacements[0],
                             commandline_replacements[1]))
        if not summary_commandline:
            editSummary = wikipedia.translate(wikipedia.getSite(), msg) \
                          % (' (-' + commandline_replacements[0] + ' +'
                             + commandline_replacements[1] + ')')
    elif (len(commandline_replacements) > 1):
        if (fix is None):
            for i in range(0, len(commandline_replacements), 2):
                replacements.append((commandline_replacements[i],
                                     commandline_replacements[i + 1]))
            if not summary_commandline:
                pairs = [(commandline_replacements[i],
                          commandline_replacements[i + 1])
                         for i in range(0, len(commandline_replacements), 2)]
                replacementsDescription = '(%s)' % ', '.join(
                    [('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
                editSummary = wikipedia.translate(wikipedia.getSite(), msg) \
                              % replacementsDescription
        else:
            raise wikipedia.Error(
                'Specifying -fix with replacements is undefined')
    elif fix is None:
        # Interactive mode: ask the user for one or more replacement pairs.
        old = wikipedia.input(u'Please enter the text that should be replaced:')
        new = wikipedia.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = wikipedia.input(
                u'Please enter another text that should be replaced, or '
                u'press Enter to start:')
            if old == '':
                change = change + ')'
                break
            new = wikipedia.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        if not summary_commandline:
            default_summary_message = \
                wikipedia.translate(wikipedia.getSite(), msg) % change
            wikipedia.output(u'The summary message will default to: %s'
                             % default_summary_message)
            summary_message = wikipedia.input(
                u'Press Enter to use this default message, or enter a '
                u'description of the\nchanges your bot will make:')
            if summary_message == '':
                summary_message = default_summary_message
            editSummary = summary_message
    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes.fixes[fix]
        except KeyError:
            wikipedia.output(u'Available predefined fixes are: %s'
                             % fixes.fixes.keys())
            return
        if 'regex' in fix:
            regex = fix['regex']
        if 'msg' in fix:
            editSummary = wikipedia.translate(wikipedia.getSite(), fix['msg'])
        if 'exceptions' in fix:
            exceptions = fix['exceptions']
        if 'nocase' in fix:
            caseInsensitive = fix['nocase']
        replacements = fix['replacements']

    # Set the regular expression flags
    flags = re.UNICODE
    if caseInsensitive:
        flags = flags | re.IGNORECASE
    if dotall:
        flags = flags | re.DOTALL
    if multiline:
        flags = flags | re.MULTILINE

    # Pre-compile all regular expressions here to save time later
    for i in range(len(replacements)):
        old, new = replacements[i]
        if not regex:
            old = re.escape(old)
        oldR = re.compile(old, flags)
        replacements[i] = oldR, new
    for exceptionCategory in ['title', 'require-title', 'text-contains',
                              'inside']:
        if exceptionCategory in exceptions:
            patterns = exceptions[exceptionCategory]
            if not regex:
                patterns = [re.escape(pattern) for pattern in patterns]
            patterns = [re.compile(pattern, flags) for pattern in patterns]
            exceptions[exceptionCategory] = patterns

    if xmlFilename:
        gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart,
                                          replacements, exceptions)
    elif useSql:
        whereClause = 'WHERE (%s)' % ' OR '.join(
            ["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
             for (old, new) in replacements])
        # Only 'text-contains' exceptions can be expressed in SQL; the old
        # code iterated the exceptions dict itself, i.e. its key strings,
        # and crashed on the missing .pattern attribute.
        if exceptions.get('text-contains'):
            exceptClause = 'AND NOT (%s)' % ' OR '.join(
                ["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern)
                 for exc in exceptions['text-contains']])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)
    elif PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle)
                 for PageTitle in PageTitles]
        gen = iter(pages)

    gen = genFactory.getCombinedGenerator(gen)
    if not gen:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('replace')
        return
    if xmlFilename:
        # XML parsing can be quite slow, so use smaller batches and
        # longer lookahead.
        preloadingGen = pagegenerators.PreloadingGenerator(gen,
                                            pageNumber=20, lookahead=100)
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall,
                       allowoverlap, recursive, add_cat, sleep, editSummary)
    bot.run()

if __name__ == "__main__":

    # Ensure the framework is shut down even if main() raises or the user
    # aborts with Ctrl-C.
    try:
        main()
    finally:
        wikipedia.stopme()