(my mod) |
(pre tags - new version of replace.py) |
||
Line 1: | Line 1: | ||
Standard replace.py, June 2010: | |||
<pre> | |||
# -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||
""" | """ | ||
Line 65: | Line 66: | ||
resources. This will slow it down between a regex and another | resources. This will slow it down between a regex and another | ||
in order not to waste too much CPU. | in order not to waste too much CPU. | ||
-query: The maximum number of pages that the bot will load at once. | |||
Default value is 60. Ignored when reading an XML file. | |||
-fix:XYZ Perform one of the predefined replacements tasks, which are | -fix:XYZ Perform one of the predefined replacements tasks, which are | ||
Line 86: | Line 90: | ||
will be regarded as a regular expression, and the second | will be regarded as a regular expression, and the second | ||
argument might contain expressions like \\1 or \g<name>. | argument might contain expressions like \\1 or \g<name>. | ||
It is possible to introduce more than one pair of old text | |||
and replacement. | |||
Examples: | Examples: | ||
Line 99: | Line 105: | ||
python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0 | python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0 | ||
If you want to do more than one replacement at a time, use this: | |||
python replace.py -xml:foobar.xml "Errror" "Error" "Faail" "Fail" -namespace:0 | |||
If you have a page called 'John Doe' and want to fix the format of ISBNs, use: | If you have a page called 'John Doe' and want to fix the format of ISBNs, use: | ||
Line 109: | Line 118: | ||
python replace.py referer referrer -file:typos.txt -excepttext:HTTP | python replace.py referer referrer -file:typos.txt -excepttext:HTTP | ||
""" | """ | ||
from __future__ import generators | |||
# | # | ||
# (C) Daniel Herding & the | # (C) Daniel Herding & the Pywikipedia team, 2004-2009 | ||
# | |||
__version__='$Id: replace.py 7695 2009-11-26 09:28:38Z alexsh $' | |||
# | # | ||
# Distributed under the terms of the MIT license. | # Distributed under the terms of the MIT license. | ||
# | # | ||
import sys, re, time | import sys, re, time | ||
import wikipedia | import wikipedia as pywikibot | ||
import pagegenerators | |||
import editarticle | import editarticle | ||
import webbrowser | import webbrowser | ||
Line 131: | Line 143: | ||
} | } | ||
# Summary messages in different languages | # Summary messages in different languages | ||
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes' | # NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes' | ||
# below. | # below. | ||
msg = { | msg = { | ||
'ar': u'%s روبوت : استبدال تلقائي للنص', | 'ar': u'%s روبوت : استبدال تلقائي للنص', | ||
Line 168: | Line 179: | ||
'sr': u'Бот: Аутоматска замена текста %s', | 'sr': u'Бот: Аутоматска замена текста %s', | ||
'sv': u'Bot: Automatisk textersättning: %s', | 'sv': u'Bot: Automatisk textersättning: %s', | ||
'uk': u'Бот: Автоматизована заміна тексту: %s', | |||
'zh': u'機器人:執行文字代換作業 %s', | 'zh': u'機器人:執行文字代換作業 %s', | ||
} | } | ||
Line 196: | Line 208: | ||
self.excsInside = [] | self.excsInside = [] | ||
if | if "inside-tags" in self.exceptions: | ||
self.excsInside += self.exceptions['inside-tags'] | self.excsInside += self.exceptions['inside-tags'] | ||
if | if "inside" in self.exceptions: | ||
self.excsInside += self.exceptions['inside'] | self.excsInside += self.exceptions['inside'] | ||
import xmlreader | import xmlreader | ||
self.site = | self.site = pywikibot.getSite() | ||
dump = xmlreader.XmlDump(self.xmlFilename) | dump = xmlreader.XmlDump(self.xmlFilename) | ||
self.parser = dump.parse() | self.parser = dump.parse() | ||
Line 216: | Line 228: | ||
new_text = entry.text | new_text = entry.text | ||
for old, new in self.replacements: | for old, new in self.replacements: | ||
new_text = | new_text = pywikibot.replaceExcept(new_text, old, new, self.excsInside, self.site) | ||
if new_text != entry.text: | if new_text != entry.text: | ||
yield | yield pywikibot.Page(self.site, entry.title) | ||
except KeyboardInterrupt: | except KeyboardInterrupt: | ||
try: | try: | ||
if not self.skipping: | if not self.skipping: | ||
pywikibot.output( | |||
u'To resume, use "-xmlstart:%s" on the command line.' | u'To resume, use "-xmlstart:%s" on the command line.' | ||
% entry.title) | % entry.title) | ||
Line 229: | Line 241: | ||
def isTitleExcepted(self, title): | def isTitleExcepted(self, title): | ||
if | if "title" in self.exceptions: | ||
for exc in self.exceptions['title']: | for exc in self.exceptions['title']: | ||
if exc.search(title): | if exc.search(title): | ||
return True | return True | ||
if | if "require-title" in self.exceptions: | ||
for req in self.exceptions['require-title']: | for req in self.exceptions['require-title']: | ||
if not req.search(title): # if not all requirements are met: | if not req.search(title): # if not all requirements are met: | ||
Line 241: | Line 253: | ||
def isTextExcepted(self, text): | def isTextExcepted(self, text): | ||
if | if "text-contains" in self.exceptions: | ||
for exc in self.exceptions['text-contains']: | for exc in self.exceptions['text-contains']: | ||
if exc.search(text): | if exc.search(text): | ||
Line 286: | Line 298: | ||
inside-tags | inside-tags | ||
A list of strings. These strings must be keys from the | A list of strings. These strings must be keys from the | ||
exceptionRegexes dictionary in | exceptionRegexes dictionary in pywikibot.replaceExcept(). | ||
""" | """ | ||
Line 295: | Line 307: | ||
self.allowoverlap = allowoverlap | self.allowoverlap = allowoverlap | ||
self.recursive = recursive | self.recursive = recursive | ||
if addedCat: | |||
site = pywikibot.getSite() | |||
self.addedCat = pywikibot.Page(site, addedCat, defaultNamespace=14) | |||
self.sleep = sleep | |||
# Some function to set default editSummary should probably be added | # Some function to set default editSummary should probably be added | ||
self.editSummary = editSummary | self.editSummary = editSummary | ||
def isTitleExcepted(self, title): | def isTitleExcepted(self, title): | ||
Line 308: | Line 318: | ||
Iff one of the exceptions applies for the given title, returns True. | Iff one of the exceptions applies for the given title, returns True. | ||
""" | """ | ||
if | if "title" in self.exceptions: | ||
for exc in self.exceptions['title']: | for exc in self.exceptions['title']: | ||
if exc.search(title): | if exc.search(title): | ||
return True | return True | ||
if | if "require-title" in self.exceptions: | ||
for req in self.exceptions['require-title']: | for req in self.exceptions['require-title']: | ||
if not req.search(title): | if not req.search(title): | ||
Line 323: | Line 333: | ||
returns True. | returns True. | ||
""" | """ | ||
if | if "text-contains" in self.exceptions: | ||
for exc in self.exceptions['text-contains']: | for exc in self.exceptions['text-contains']: | ||
if exc.search(original_text): | if exc.search(original_text): | ||
Line 336: | Line 346: | ||
new_text = original_text | new_text = original_text | ||
exceptions = [] | exceptions = [] | ||
if | if "inside-tags" in self.exceptions: | ||
exceptions += self.exceptions['inside-tags'] | exceptions += self.exceptions['inside-tags'] | ||
if | if "inside" in self.exceptions: | ||
exceptions += self.exceptions['inside'] | exceptions += self.exceptions['inside'] | ||
for old, new in self.replacements: | for old, new in self.replacements: | ||
if self.sleep is not None: | if self.sleep is not None: | ||
time.sleep(self.sleep) | time.sleep(self.sleep) | ||
new_text = | new_text = pywikibot.replaceExcept(new_text, old, new, exceptions, | ||
allowoverlap=self.allowoverlap) | allowoverlap=self.allowoverlap) | ||
return new_text | return new_text | ||
Line 355: | Line 365: | ||
for page in self.generator: | for page in self.generator: | ||
if self.isTitleExcepted(page.title()): | if self.isTitleExcepted(page.title()): | ||
pywikibot.output( | |||
u'Skipping %s because the title is on the exceptions list.' | u'Skipping %s because the title is on the exceptions list.' | ||
% page.aslink()) | % page.aslink()) | ||
Line 363: | Line 373: | ||
original_text = page.get(get_redirect=True) | original_text = page.get(get_redirect=True) | ||
if not page.canBeEdited(): | if not page.canBeEdited(): | ||
pywikibot.output(u"You can't edit page %s" | |||
% page.aslink()) | % page.aslink()) | ||
continue | continue | ||
except | except pywikibot.NoPage: | ||
pywikibot.output(u'Page %s not found' % page.aslink()) | |||
continue | continue | ||
new_text = original_text | new_text = original_text | ||
while True: | while True: | ||
if self.isTextExcepted(new_text): | if self.isTextExcepted(new_text): | ||
pywikibot.output( | |||
u'Skipping %s because it contains text that is on the exceptions list.' | u'Skipping %s because it contains text that is on the exceptions list.' | ||
% page.aslink()) | % page.aslink()) | ||
Line 378: | Line 388: | ||
new_text = self.doReplacements(new_text) | new_text = self.doReplacements(new_text) | ||
if new_text == original_text: | if new_text == original_text: | ||
pywikibot.output(u'No changes were necessary in %s' | |||
% page.aslink()) | |||
break | break | ||
if self.recursive: | if self.recursive: | ||
Line 387: | Line 397: | ||
newest_text = self.doReplacements(new_text) | newest_text = self.doReplacements(new_text) | ||
if hasattr(self, "addedCat"): | if hasattr(self, "addedCat"): | ||
cats = page.categories( | cats = page.categories() | ||
if self.addedCat not in cats: | if self.addedCat not in cats: | ||
cats.append(self.addedCat) | cats.append(self.addedCat) | ||
new_text = | new_text = pywikibot.replaceCategoryLinks(new_text, | ||
cats) | cats) | ||
# Show the title of the page we're working on. | # Show the title of the page we're working on. | ||
# Highlight the title in purple. | # Highlight the title in purple. | ||
pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" | |||
% page.title()) | % page.title()) | ||
pywikibot.showDiff(original_text, new_text) | |||
if self.acceptall: | if self.acceptall: | ||
break | break | ||
choice = | choice = pywikibot.inputChoice( | ||
u'Do you want to accept these changes?', | u'Do you want to accept these changes?', | ||
['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"], | ['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"], | ||
Line 415: | Line 425: | ||
page.site().nice_get_address(page.title()) | page.site().nice_get_address(page.title()) | ||
)) | )) | ||
pywikibot.input("Press Enter when finished in browser.") | |||
original_text = page.get(get_redirect=True, force=True) | original_text = page.get(get_redirect=True, force=True) | ||
new_text = original_text | new_text = original_text | ||
Line 430: | Line 440: | ||
try: | try: | ||
page.put(new_text, self.editSummary) | page.put(new_text, self.editSummary) | ||
except | except pywikibot.EditConflict: | ||
pywikibot.output(u'Skipping %s because of edit conflict' | |||
% (page.title(),)) | % (page.title(),)) | ||
except | except pywikibot.SpamfilterError, e: | ||
pywikibot.output( | |||
u'Cannot change %s because of blacklist entry %s' | u'Cannot change %s because of blacklist entry %s' | ||
% (page.title(), e.url)) | % (page.title(), e.url)) | ||
except | except pywikibot.PageNotSaved, error: | ||
pywikibot.output(u'Error putting page: %s' | |||
% (error.args,)) | % (error.args,)) | ||
except | except pywikibot.LockedPage: | ||
pywikibot.output(u'Skipping %s (locked page)' | |||
% (page.title(),)) | % (page.title(),)) | ||
Line 498: | Line 508: | ||
# Do not recurse replacement | # Do not recurse replacement | ||
recursive = False | recursive = False | ||
# This is the maximum number of pages to load per query | |||
maxquerysize = 60 | |||
# This factory is responsible for processing command line arguments | # This factory is responsible for processing command line arguments | ||
# that are also used by other scripts and that determine on which pages | # that are also used by other scripts and that determine on which pages | ||
Line 504: | Line 516: | ||
# Load default summary message. | # Load default summary message. | ||
# BUG WARNING: This is probably incompatible with the -lang parameter. | # BUG WARNING: This is probably incompatible with the -lang parameter. | ||
editSummary = | editSummary = pywikibot.translate(pywikibot.getSite(), msg) | ||
# Between a regex and another (using -fix) sleep some time (not to waste | # Between a regex and another (using -fix) sleep some time (not to waste | ||
# too much CPU | # too much CPU | ||
Line 510: | Line 522: | ||
# Read commandline parameters. | # Read commandline parameters. | ||
for arg in | for arg in pywikibot.handleArgs(*args): | ||
if arg == '-regex': | if arg == '-regex': | ||
regex = True | regex = True | ||
elif arg.startswith('-xmlstart'): | elif arg.startswith('-xmlstart'): | ||
if len(arg) == 9: | if len(arg) == 9: | ||
xmlStart = | xmlStart = pywikibot.input( | ||
u'Please enter the dumped article to start with:') | u'Please enter the dumped article to start with:') | ||
else: | else: | ||
Line 521: | Line 533: | ||
elif arg.startswith('-xml'): | elif arg.startswith('-xml'): | ||
if len(arg) == 4: | if len(arg) == 4: | ||
xmlFilename = | xmlFilename = pywikibot.input( | ||
u'Please enter the XML dump\'s filename:') | u'Please enter the XML dump\'s filename:') | ||
else: | else: | ||
Line 529: | Line 541: | ||
elif arg.startswith('-page'): | elif arg.startswith('-page'): | ||
if len(arg) == 5: | if len(arg) == 5: | ||
PageTitles.append( | PageTitles.append(pywikibot.input( | ||
u'Which page do you want to change?')) | u'Which page do you want to change?')) | ||
else: | else: | ||
Line 558: | Line 570: | ||
multiline = True | multiline = True | ||
elif arg.startswith('-addcat:'): | elif arg.startswith('-addcat:'): | ||
add_cat = arg[ | add_cat = arg[8:] | ||
elif arg.startswith('-summary:'): | elif arg.startswith('-summary:'): | ||
editSummary = arg[ | editSummary = arg[9:] | ||
summary_commandline = True | summary_commandline = True | ||
elif arg.startswith('-allowoverlap'): | elif arg.startswith('-allowoverlap'): | ||
allowoverlap = True | allowoverlap = True | ||
elif arg.startswith('-query:'): | |||
maxquerysize = int(arg[7:]) | |||
else: | else: | ||
if not genFactory.handleArg(arg): | if not genFactory.handleArg(arg): | ||
Line 569: | Line 583: | ||
if (len(commandline_replacements) % 2): | if (len(commandline_replacements) % 2): | ||
raise | raise pywikibot.Error, 'require even number of replacements.' | ||
elif (len(commandline_replacements) == 2 and fix is None): | elif (len(commandline_replacements) == 2 and fix is None): | ||
replacements.append((commandline_replacements[0], | replacements.append((commandline_replacements[0], | ||
commandline_replacements[1])) | commandline_replacements[1])) | ||
if summary_commandline | if not summary_commandline: | ||
editSummary = | editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % (' (-' + commandline_replacements[0] + ' +' | ||
+ commandline_replacements[1] + ')') | + commandline_replacements[1] + ')') | ||
elif (len(commandline_replacements) > 1): | elif (len(commandline_replacements) > 1): | ||
Line 581: | Line 595: | ||
replacements.append((commandline_replacements[i], | replacements.append((commandline_replacements[i], | ||
commandline_replacements[i + 1])) | commandline_replacements[i + 1])) | ||
if summary_commandline | if not summary_commandline: | ||
pairs = [( commandline_replacements[i], | pairs = [( commandline_replacements[i], | ||
commandline_replacements[i + 1] ) | commandline_replacements[i + 1] ) | ||
Line 587: | Line 601: | ||
replacementsDescription = '(%s)' % ', '.join( | replacementsDescription = '(%s)' % ', '.join( | ||
[('-' + pair[0] + ' +' + pair[1]) for pair in pairs]) | [('-' + pair[0] + ' +' + pair[1]) for pair in pairs]) | ||
editSummary = | editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % replacementsDescription | ||
else: | else: | ||
raise | raise pywikibot.Error( | ||
'Specifying -fix with replacements is undefined') | 'Specifying -fix with replacements is undefined') | ||
elif fix is None: | elif fix is None: | ||
old = | old = pywikibot.input(u'Please enter the text that should be replaced:') | ||
new = | new = pywikibot.input(u'Please enter the new text:') | ||
change = '(-' + old + ' +' + new | change = '(-' + old + ' +' + new | ||
replacements.append((old, new)) | replacements.append((old, new)) | ||
while True: | while True: | ||
old = | old = pywikibot.input( | ||
u'Please enter another text that should be replaced, or press Enter to start:') | u'Please enter another text that should be replaced, or press Enter to start:') | ||
if old == '': | if old == '': | ||
change = change + ')' | change = change + ')' | ||
break | break | ||
new = | new = pywikibot.input(u'Please enter the new text:') | ||
change = change + ' & -' + old + ' +' + new | change = change + ' & -' + old + ' +' + new | ||
replacements.append((old, new)) | replacements.append((old, new)) | ||
if not summary_commandline | if not summary_commandline: | ||
default_summary_message = | default_summary_message = pywikibot.translate(pywikibot.getSite(), msg) % change | ||
pywikibot.output(u'The summary message will default to: %s' | |||
% default_summary_message) | % default_summary_message) | ||
summary_message = | summary_message = pywikibot.input( | ||
u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:') | u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:') | ||
if summary_message == '': | if summary_message == '': | ||
Line 620: | Line 634: | ||
fix = fixes.fixes[fix] | fix = fixes.fixes[fix] | ||
except KeyError: | except KeyError: | ||
pywikibot.output(u'Available predefined fixes are: %s' | |||
% fixes.fixes.keys()) | % fixes.fixes.keys()) | ||
return | return | ||
if | if "regex" in fix: | ||
regex = fix['regex'] | regex = fix['regex'] | ||
if | if "msg" in fix: | ||
editSummary = | editSummary = pywikibot.translate(pywikibot.getSite(), fix['msg']) | ||
if | if "exceptions" in fix: | ||
exceptions = fix['exceptions'] | exceptions = fix['exceptions'] | ||
if | if "nocase" in fix: | ||
caseInsensitive = fix['nocase'] | caseInsensitive = fix['nocase'] | ||
replacements = fix['replacements'] | replacements = fix['replacements'] | ||
Line 684: | Line 698: | ||
gen = pagegenerators.MySQLPageGenerator(query) | gen = pagegenerators.MySQLPageGenerator(query) | ||
elif PageTitles: | elif PageTitles: | ||
pages = [ | pages = [pywikibot.Page(pywikibot.getSite(), PageTitle) | ||
for PageTitle in PageTitles] | for PageTitle in PageTitles] | ||
gen = iter(pages) | gen = iter(pages) | ||
Line 691: | Line 705: | ||
if not gen: | if not gen: | ||
# syntax error, show help text from the top of this file | # syntax error, show help text from the top of this file | ||
pywikibot.showHelp('replace') | |||
return | return | ||
if xmlFilename: | if xmlFilename: | ||
Line 699: | Line 713: | ||
pageNumber=20, lookahead=100) | pageNumber=20, lookahead=100) | ||
else: | else: | ||
preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber= | preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=maxquerysize) | ||
bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep, editSummary) | bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep, editSummary) | ||
bot.run() | bot.run() | ||
if __name__ == "__main__": | if __name__ == "__main__": | ||
Line 707: | Line 722: | ||
main() | main() | ||
finally: | finally: | ||
pywikibot.stopme() | |||
</pre> |
Revision as of 08:40, 8 June 2010
Standard replace.py, June 2010:
# -*- coding: utf-8 -*- """ This bot will make direct text replacements. It will retrieve information on which pages might need changes either from an XML dump or a text file, or only change a single page. These command line parameters can be used to specify which pages to work on: ¶ms; -xml Retrieve information from a local XML dump (pages-articles or pages-meta-current, see http://download.wikimedia.org). Argument can also be given as "-xml:filename". -page Only edit a specific page. Argument can also be given as "-page:pagetitle". You can give this parameter multiple times to edit multiple pages. Furthermore, the following command line parameters are supported: -regex Make replacements using regular expressions. If this argument isn't given, the bot will make simple text replacements. -nocase Use case insensitive regular expressions. -dotall Make the dot match any character at all, including a newline. Without this flag, '.' will match anything except a newline. -multiline '^' and '$' will now match begin and end of each line. -xmlstart (Only works with -xml) Skip all articles in the XML dump before the one specified (may also be given as -xmlstart:Article). -addcat:cat_name Adds "cat_name" category to every altered page. -excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex argument is given, XYZ will be regarded as a regular expression. -requiretitle:XYZ Only do pages with titles that contain XYZ. If the -regex argument is given, XYZ will be regarded as a regular expression. -excepttext:XYZ Skip pages which contain the text XYZ. If the -regex argument is given, XYZ will be regarded as a regular expression. -exceptinside:XYZ Skip occurences of the to-be-replaced text which lie within XYZ. If the -regex argument is given, XYZ will be regarded as a regular expression. -exceptinsidetag:XYZ Skip occurences of the to-be-replaced text which lie within an XYZ tag. 
-summary:XYZ Set the summary message text for the edit to XYZ, bypassing the predefined message texts with original and replacements inserted. -sleep:123 If you use -fix you can check multiple regex at the same time in every page. This can lead to a great waste of CPU because the bot will check every regex without waiting using all the resources. This will slow it down between a regex and another in order not to waste too much CPU. -query: The maximum number of pages that the bot will load at once. Default value is 60. Ignored when reading an XML file. -fix:XYZ Perform one of the predefined replacements tasks, which are given in the dictionary 'fixes' defined inside the file fixes.py. The -regex and -nocase argument and given replacements will be ignored if you use -fix. Currently available predefined fixes are: &fixes-help; -always Don't prompt you for each replacement -recursive Recurse replacement as long as possible. Be careful, this might lead to an infinite loop. -allowoverlap When occurences of the pattern overlap, replace all of them. Be careful, this might lead to an infinite loop. other: First argument is the old text, second argument is the new text. If the -regex argument is given, the first argument will be regarded as a regular expression, and the second argument might contain expressions like \\1 or \g<name>. It is possible to introduce more than one pair of old text and replacement. Examples: If you want to change templates from the old syntax, e.g. {{msg:Stub}}, to the new syntax, e.g. {{Stub}}, download an XML dump file (pages-articles) from http://download.wikimedia.org, then use this command: python replace.py -xml -regex "{{msg:(.*?)}}" "{{\\1}}" If you have a dump called foobar.xml and want to fix typos in articles, e.g. 
Errror -> Error, use this: python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0 If you want to do more than one replacement at a time, use this: python replace.py -xml:foobar.xml "Errror" "Error" "Faail" "Fail" -namespace:0 If you have a page called 'John Doe' and want to fix the format of ISBNs, use: python replace.py -page:John_Doe -fix:isbn This command will change 'referer' to 'referrer', but not in pages which talk about HTTP, where the typo has become part of the standard: python replace.py referer referrer -file:typos.txt -excepttext:HTTP """ from __future__ import generators # # (C) Daniel Herding & the Pywikipedia team, 2004-2009 # __version__='$Id: replace.py 7695 2009-11-26 09:28:38Z alexsh $' # # Distributed under the terms of the MIT license. # import sys, re, time import wikipedia as pywikibot import pagegenerators import editarticle import webbrowser # Imports predefined replacements tasks from fixes.py import fixes # This is required for the text that is shown when you run this script # with the parameter -help. docuReplacements = { '¶ms;': pagegenerators.parameterHelp, '&fixes-help;': fixes.help, } # Summary messages in different languages # NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes' # below. 
msg = {
    'ar': u'%s روبوت : استبدال تلقائي للنص',
    'ca': u'Robot: Reemplaçament automàtic de text %s',
    'cs': u'Robot automaticky nahradil text: %s',
    'de': u'Bot: Automatisierte Textersetzung %s',
    'el': u'Ρομπότ: Αυτόματη αντικατάσταση κειμένου %s',
    'en': u'Robot: Automated text replacement %s',
    'es': u'Robot: Reemplazo automático de texto %s',
    'fa': u'ربات: تغییر خودکار متن %s',
    'fi': u'Botti korvasi automaattisesti tekstin %s',
    'fr': u'Robot : Remplacement de texte automatisé %s',
    'he': u'בוט: החלפת טקסט אוטומטית %s',
    'hu': u'Robot: Automatikus szövegcsere %s',
    'ia': u'Robot: Reimplaciamento automatic de texto %s',
    'id': u'Bot: Penggantian teks otomatis %s',
    'is': u'Vélmenni: breyti texta %s',
    'it': u'Bot: Sostituzione automatica %s',
    'ja': u'ロボットによる: 文字置き換え %s',
    'ka': u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
    'kk': u'Бот: Мәтінді өздікті алмастырды: %s',
    'ksh': u'Bot: hät outomatesch Täx jetuusch: %s',
    'lt': u'robotas: Automatinis teksto keitimas %s',
    'nds': u'Bot: Text automaatsch utwesselt: %s',
    'nds-nl': u'Bot: autematisch tekse vervungen %s',
    'nl': u'Bot: automatisch tekst vervangen %s',
    'nn': u'robot: automatisk teksterstatning: %s',
    'no': u'robot: automatisk teksterstatning: %s',
    'pl': u'Robot automatycznie zamienia tekst %s',
    'pt': u'Bot: Mudança automática %s',
    'ru': u'Робот: Автоматизированная замена текста %s',
    'sr': u'Бот: Аутоматска замена текста %s',
    'sv': u'Bot: Automatisk textersättning: %s',
    'uk': u'Бот: Автоматизована заміна тексту: %s',
    'zh': u'機器人:執行文字代換作業 %s',
}


def _inside_exceptions(exceptions):
    """Return a new list with the 'inside-tags' and 'inside' entries."""
    # A fresh list is built so the lists stored in the caller's dictionary
    # are never modified.
    inside = []
    if "inside-tags" in exceptions:
        inside += exceptions['inside-tags']
    if "inside" in exceptions:
        inside += exceptions['inside']
    return inside


def _title_is_excepted(exceptions, title):
    """Return True if the title has to be skipped.

    A title is skipped when any 'title' regex matches it, or when any
    'require-title' regex does NOT match it.
    """
    if "title" in exceptions:
        for exc in exceptions['title']:
            if exc.search(title):
                return True
    if "require-title" in exceptions:
        for req in exceptions['require-title']:
            # if not all requirements are met:
            if not req.search(title):
                return True
    return False


def _text_is_excepted(exceptions, text):
    """Return True if any 'text-contains' regex matches the text."""
    if "text-contains" in exceptions:
        for exc in exceptions['text-contains']:
            if exc.search(text):
                return True
    return False


class XmlDumpReplacePageGenerator:
    """
    Iterator that will yield Pages that might contain text to replace.

    These pages will be retrieved from a local XML dump file.
    Arguments:
        * xmlFilename  - The dump's path, either absolute or relative
        * xmlStart     - Skip all articles in the dump before this one
        * replacements - A list of 2-tuples of original text (as a
                         compiled regular expression) and replacement
                         text (as a string).
        * exceptions   - A dictionary which defines when to ignore an
                         occurrence. See docu of the ReplaceRobot
                         constructor below.

    """
    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        # Entries are skipped until the one titled xmlStart has been reached.
        self.skipping = bool(xmlStart)

        self.excsInside = _inside_exceptions(self.exceptions)
        import xmlreader
        self.site = pywikibot.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()

    def __iter__(self):
        """Yield a Page for every dump entry the replacements would change."""
        try:
            for entry in self.parser:
                if self.skipping:
                    if entry.title != self.xmlStart:
                        continue
                    self.skipping = False
                if not self.isTitleExcepted(entry.title) \
                        and not self.isTextExcepted(entry.text):
                    new_text = entry.text
                    for old, new in self.replacements:
                        new_text = pywikibot.replaceExcept(
                            new_text, old, new, self.excsInside, self.site)
                    if new_text != entry.text:
                        yield pywikibot.Page(self.site, entry.title)
        except KeyboardInterrupt:
            try:
                if not self.skipping:
                    pywikibot.output(
                        u'To resume, use "-xmlstart:%s" on the command line.'
                        % entry.title)
            except NameError:
                # 'entry' is not bound when the interrupt arrives before the
                # first dump entry was read; there is nothing to resume from.
                pass

    def isTitleExcepted(self, title):
        """Return True if the exceptions tell us to skip this title."""
        return _title_is_excepted(self.exceptions, title)

    def isTextExcepted(self, text):
        """Return True if the exceptions tell us to skip this page text."""
        return _text_is_excepted(self.exceptions, text)


class ReplaceRobot:
    """
    A bot that can do text replacements.
    """
    def __init__(self, generator, replacements, exceptions=None,
                 acceptall=False, allowoverlap=False, recursive=False,
                 addedCat=None, sleep=None, editSummary=''):
        """
        Arguments:
            * generator    - A generator that yields Page objects.
            * replacements - A list of 2-tuples of original text (as a
                             compiled regular expression) and replacement
                             text (as a string).
            * exceptions   - A dictionary which defines when not to change an
                             occurrence. See below. Defaults to an empty
                             dictionary.
            * acceptall    - If True, the user won't be prompted before changes
                             are made.
            * allowoverlap - If True, when matches overlap, all of them are
                             replaced.
            * recursive    - If True, the replacements are re-applied until
                             the text no longer changes.
            * addedCat     - If set to a value, add this category to every page
                             touched.
            * sleep        - If not None, number of seconds to sleep before
                             each single replacement is applied.
            * editSummary  - Summary passed along when the page is saved.

        Structure of the exceptions dictionary:
        This dictionary can have these keys:

            title
                A list of regular expressions. All pages with titles that
                are matched by one of these regular expressions are skipped.
            require-title
                A list of regular expressions. All pages with titles that
                are not matched by every one of these regular expressions
                are skipped.
            text-contains
                A list of regular expressions. All pages with text that
                contains a part which is matched by one of these regular
                expressions are skipped.
            inside
                A list of regular expressions. All occurrences are skipped
                which lie within a text region which is matched by one of
                these regular expressions.
            inside-tags
                A list of strings. These strings must be keys from the
                exceptionRegexes dictionary in pywikibot.replaceExcept().

        """
        # A literal {} default would be one dictionary shared by every
        # instance created without this argument.
        if exceptions is None:
            exceptions = {}
        self.generator = generator
        self.replacements = replacements
        self.exceptions = exceptions
        self.acceptall = acceptall
        self.allowoverlap = allowoverlap
        self.recursive = recursive
        # The attribute only exists when a category was requested; run()
        # checks for it with hasattr().
        if addedCat:
            site = pywikibot.getSite()
            self.addedCat = pywikibot.Page(site, addedCat, defaultNamespace=14)
        self.sleep = sleep
        # Some function to set default editSummary should probably be added
        self.editSummary = editSummary

    def isTitleExcepted(self, title):
        """
        Iff one of the exceptions applies for the given title, returns True.
        """
        return _title_is_excepted(self.exceptions, title)

    def isTextExcepted(self, original_text):
        """
        Iff one of the exceptions applies for the given page contents,
        returns True.
        """
        return _text_is_excepted(self.exceptions, original_text)

    def doReplacements(self, original_text):
        """
        Returns the text which is generated by applying all replacements to
        the given text.
        """
        new_text = original_text
        exceptions = _inside_exceptions(self.exceptions)
        for old, new in self.replacements:
            if self.sleep is not None:
                time.sleep(self.sleep)
            new_text = pywikibot.replaceExcept(new_text, old, new, exceptions,
                                               allowoverlap=self.allowoverlap)
        return new_text

    def run(self):
        """
        Starts the robot.
        """
        # Run the generator which will yield Pages which might need to be
        # changed.
        for page in self.generator:
            if self.isTitleExcepted(page.title()):
                pywikibot.output(
                    u'Skipping %s because the title is on the exceptions list.'
                    % page.aslink())
                continue
            try:
                # Load the page's text from the wiki
                original_text = page.get(get_redirect=True)
                if not page.canBeEdited():
                    pywikibot.output(u"You can't edit page %s"
                                     % page.aslink())
                    continue
            except pywikibot.NoPage:
                pywikibot.output(u'Page %s not found' % page.aslink())
                continue
            new_text = original_text
            # One pass per proposal shown to the user; 'continue' presents a
            # new proposal, 'break' ends the dialogue for this page.
            while True:
                if self.isTextExcepted(new_text):
                    pywikibot.output(
                        u'Skipping %s because it contains text that is on the exceptions list.'
                        % page.aslink())
                    break
                new_text = self.doReplacements(new_text)
                if new_text == original_text:
                    pywikibot.output(u'No changes were necessary in %s'
                                     % page.aslink())
                    break
                if self.recursive:
                    # Re-apply the replacements until a fixed point is
                    # reached.
                    newest_text = self.doReplacements(new_text)
                    while newest_text != new_text:
                        new_text = newest_text
                        newest_text = self.doReplacements(new_text)
                if hasattr(self, "addedCat"):
                    cats = page.categories()
                    if self.addedCat not in cats:
                        cats.append(self.addedCat)
                        new_text = pywikibot.replaceCategoryLinks(new_text,
                                                                  cats)
                # Show the title of the page we're working on.
                # Highlight the title in purple.
                pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                                 % page.title())
                pywikibot.showDiff(original_text, new_text)
                if self.acceptall:
                    break
                choice = pywikibot.inputChoice(
                    u'Do you want to accept these changes?',
                    ['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"],
                    ['y', 'N', 'e', 'b', 'a', 'q'], 'N')
                if choice == 'e':
                    editor = editarticle.TextEditor()
                    as_edited = editor.edit(original_text)
                    # if user didn't press Cancel
                    if as_edited and as_edited != new_text:
                        new_text = as_edited
                    continue
                if choice == 'b':
                    webbrowser.open("http://%s%s" % (
                        page.site().hostname(),
                        page.site().nice_get_address(page.title())
                    ))
                    pywikibot.input("Press Enter when finished in browser.")
                    # Reload the page and start over from the fresh text.
                    original_text = page.get(get_redirect=True, force=True)
                    new_text = original_text
                    continue
                if choice == 'q':
                    return
                if choice == 'a':
                    self.acceptall = True
                if choice == 'y':
                    page.put_async(new_text, self.editSummary)
                # choice must be 'N'
                break
            if self.acceptall and new_text != original_text:
                try:
                    page.put(new_text, self.editSummary)
                except pywikibot.EditConflict:
                    pywikibot.output(u'Skipping %s because of edit conflict'
                                     % (page.title(),))
                except pywikibot.SpamfilterError as e:
                    pywikibot.output(
                        u'Cannot change %s because of blacklist entry %s'
                        % (page.title(), e.url))
                except pywikibot.PageNotSaved as error:
                    pywikibot.output(u'Error putting page: %s'
                                     % (error.args,))
                except pywikibot.LockedPage:
                    pywikibot.output(u'Skipping %s (locked page)'
                                     % (page.title(),))


def prepareRegexForMySQL(pattern):
    """Rewrite a Python regex so it can be embedded in a MySQL query.

    The shorthand classes \\s, \\d and \\w are replaced by the bracket
    names [:space:], [:digit:] and [:alnum:], and every single quote is
    prefixed with a backslash.
    """
    # Raw strings make the intended backslash explicit instead of relying
    # on unrecognised escape sequences being kept as-is.
    pattern = pattern.replace(r'\s', '[:space:]')
    pattern = pattern.replace(r'\d', '[:digit:]')
    pattern = pattern.replace(r'\w', '[:alnum:]')
    pattern = pattern.replace("'", "\\" + "'")
    return pattern


def main(*args):
    add_cat = None
    gen = None
    # summary message
    summary_commandline = False
    # Array which will collect commandline parameters.
# First element is original text, second element is replacement text. commandline_replacements = [] # A list of 2-tuples of original text and replacement text. replacements = [] # Don't edit pages which contain certain texts. exceptions = { 'title': [], 'text-contains': [], 'inside': [], 'inside-tags': [], 'require-title': [], # using a seperate requirements dict needs some } # major refactoring of code. # Should the elements of 'replacements' and 'exceptions' be interpreted # as regular expressions? regex = False # Predefined fixes from dictionary 'fixes' (see above). fix = None # the dump's path, either absolute or relative, which will be used # if -xml flag is present xmlFilename = None useSql = False PageTitles = [] # will become True when the user presses a ('yes to all') or uses the # -always flag. acceptall = False # Will become True if the user inputs the commandline parameter -nocase caseInsensitive = False # Will become True if the user inputs the commandline parameter -dotall dotall = False # Will become True if the user inputs the commandline parameter -multiline multiline = False # Do all hits when they overlap allowoverlap = False # Do not recurse replacement recursive = False # This is the maximum number of pages to load per query maxquerysize = 60 # This factory is responsible for processing command line arguments # that are also used by other scripts and that determine on which pages # to work on. genFactory = pagegenerators.GeneratorFactory() # Load default summary message. # BUG WARNING: This is probably incompatible with the -lang parameter. editSummary = pywikibot.translate(pywikibot.getSite(), msg) # Between a regex and another (using -fix) sleep some time (not to waste # too much CPU sleep = None # Read commandline parameters. 
for arg in pywikibot.handleArgs(*args): if arg == '-regex': regex = True elif arg.startswith('-xmlstart'): if len(arg) == 9: xmlStart = pywikibot.input( u'Please enter the dumped article to start with:') else: xmlStart = arg[10:] elif arg.startswith('-xml'): if len(arg) == 4: xmlFilename = pywikibot.input( u'Please enter the XML dump\'s filename:') else: xmlFilename = arg[5:] elif arg =='-sql': useSql = True elif arg.startswith('-page'): if len(arg) == 5: PageTitles.append(pywikibot.input( u'Which page do you want to change?')) else: PageTitles.append(arg[6:]) elif arg.startswith('-excepttitle:'): exceptions['title'].append(arg[13:]) elif arg.startswith('-requiretitle:'): exceptions['require-title'].append(arg[14:]) elif arg.startswith('-excepttext:'): exceptions['text-contains'].append(arg[12:]) elif arg.startswith('-exceptinside:'): exceptions['inside'].append(arg[14:]) elif arg.startswith('-exceptinsidetag:'): exceptions['inside-tags'].append(arg[17:]) elif arg.startswith('-fix:'): fix = arg[5:] elif arg.startswith('-sleep:'): sleep = float(arg[7:]) elif arg == '-always': acceptall = True elif arg == '-recursive': recursive = True elif arg == '-nocase': caseInsensitive = True elif arg == '-dotall': dotall = True elif arg == '-multiline': multiline = True elif arg.startswith('-addcat:'): add_cat = arg[8:] elif arg.startswith('-summary:'): editSummary = arg[9:] summary_commandline = True elif arg.startswith('-allowoverlap'): allowoverlap = True elif arg.startswith('-query:'): maxquerysize = int(arg[7:]) else: if not genFactory.handleArg(arg): commandline_replacements.append(arg) if (len(commandline_replacements) % 2): raise pywikibot.Error, 'require even number of replacements.' 
elif (len(commandline_replacements) == 2 and fix is None): replacements.append((commandline_replacements[0], commandline_replacements[1])) if not summary_commandline: editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % (' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')') elif (len(commandline_replacements) > 1): if (fix is None): for i in xrange (0, len(commandline_replacements), 2): replacements.append((commandline_replacements[i], commandline_replacements[i + 1])) if not summary_commandline: pairs = [( commandline_replacements[i], commandline_replacements[i + 1] ) for i in range(0, len(commandline_replacements), 2)] replacementsDescription = '(%s)' % ', '.join( [('-' + pair[0] + ' +' + pair[1]) for pair in pairs]) editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % replacementsDescription else: raise pywikibot.Error( 'Specifying -fix with replacements is undefined') elif fix is None: old = pywikibot.input(u'Please enter the text that should be replaced:') new = pywikibot.input(u'Please enter the new text:') change = '(-' + old + ' +' + new replacements.append((old, new)) while True: old = pywikibot.input( u'Please enter another text that should be replaced, or press Enter to start:') if old == '': change = change + ')' break new = pywikibot.input(u'Please enter the new text:') change = change + ' & -' + old + ' +' + new replacements.append((old, new)) if not summary_commandline: default_summary_message = pywikibot.translate(pywikibot.getSite(), msg) % change pywikibot.output(u'The summary message will default to: %s' % default_summary_message) summary_message = pywikibot.input( u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:') if summary_message == '': summary_message = default_summary_message editSummary = summary_message else: # Perform one of the predefined actions. 
try: fix = fixes.fixes[fix] except KeyError: pywikibot.output(u'Available predefined fixes are: %s' % fixes.fixes.keys()) return if "regex" in fix: regex = fix['regex'] if "msg" in fix: editSummary = pywikibot.translate(pywikibot.getSite(), fix['msg']) if "exceptions" in fix: exceptions = fix['exceptions'] if "nocase" in fix: caseInsensitive = fix['nocase'] replacements = fix['replacements'] #Set the regular expression flags flags = re.UNICODE if caseInsensitive: flags = flags | re.IGNORECASE if dotall: flags = flags | re.DOTALL if multiline: flags = flags | re.MULTILINE # Pre-compile all regular expressions here to save time later for i in range(len(replacements)): old, new = replacements[i] if not regex: old = re.escape(old) oldR = re.compile(old, flags) replacements[i] = oldR, new for exceptionCategory in ['title', 'require-title', 'text-contains', 'inside']: if exceptionCategory in exceptions: patterns = exceptions[exceptionCategory] if not regex: patterns = [re.escape(pattern) for pattern in patterns] patterns = [re.compile(pattern, flags) for pattern in patterns] exceptions[exceptionCategory] = patterns if xmlFilename: try: xmlStart except NameError: xmlStart = None gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart, replacements, exceptions) elif useSql: whereClause = 'WHERE (%s)' % ' OR '.join( ["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern) for (old, new) in replacements]) if exceptions: exceptClause = 'AND NOT (%s)' % ' OR '.join( ["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern) for exc in exceptions]) else: exceptClause = '' query = u""" SELECT page_namespace, page_title FROM page JOIN text ON (page_id = old_id) %s %s LIMIT 200""" % (whereClause, exceptClause) gen = pagegenerators.MySQLPageGenerator(query) elif PageTitles: pages = [pywikibot.Page(pywikibot.getSite(), PageTitle) for PageTitle in PageTitles] gen = iter(pages) gen = genFactory.getCombinedGenerator(gen) if not gen: # syntax error, show help text from the top of this 
file pywikibot.showHelp('replace') return if xmlFilename: # XML parsing can be quite slow, so use smaller batches and # longer lookahead. preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=20, lookahead=100) else: preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=maxquerysize) bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep, editSummary) bot.run() if __name__ == "__main__": try: main() finally: pywikibot.stopme()