User:TalBot/test-rm-soft-redir.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Remove soft redirects for specified months after fixing the pages linking to them
#
# Copyright © 2006—2010, GrafZahl (en.wikisource.org user)
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
#
# Uses some ideas from wikipedia.py by Rob W.W. Hooft, Andre Engels, which is
# distributed under the terms of the MIT licence.
#
# run with standard args "-log -putthrottle:xx"
#
# Further arguments:
#
# -cat:xxx
# Specifies the category for which soft redirects should be
# removed, for example: -cat:'Soft redirects/August 2006'
# (replace the single quotes with whatever is appropriate for
# your shell)
#
# -dumplinks
# Write all pages linking to a soft redirect page for the given
# month to a file
#
# -delete
# Actually try to delete the pages (assumes sysop privileges!).
# Otherwise the to-be-deleted page will be logged with
# [to-be-deleted] prefix.
#
# -xlink:xxx
# Specifies a set of pages to be excluded from link correction
# as a regular expression. For example, to exclude all
# discussion archives, specify -xlink:'.*/Archive.*' (replace
# the single quotes with whatever is appropriate for your
# shell).
#
# -nopipe:xxx
# Specifies a set of soft redirects as a regular expression.
# These redirects will not be added to corrected links as
# pipes. Pipes that already exist will not be altered.
#
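# For example, a full invocation might look like this (shell quoting as
# appropriate; the throttle value and flag combination are only illustrative):
#
#     python test-rm-soft-redir.py -log -putthrottle:10 \
#         -cat:'Soft redirects/August 2006' -dumplinks -xlink:'.*/Archive.*'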
#
import catlib, re, sys, wikipedia
wikipedia.get_throttle.setDelay(5)
# Handle args
args = wikipedia.handleArgs()
month = False
delete = False
dumplinks = False
xlinks = []
nopipe = []
for arg in args:
    if arg[:5] == u'-cat:':
        month = arg[5:]
    elif arg == u'-delete':
        delete = True
    elif arg == u'-dumplinks':
        dumplinks = True
    elif arg[:7] == u'-xlink:':
        try:
            xlinks.append(re.compile(arg[7:]))
        except re.error:
            wikipedia.output(u'(WWW) Ignoring invalid regular expression %s' % arg[7:])
    elif arg[:8] == u'-nopipe:':
        try:
            nopipe.append(re.compile(arg[8:]))
        except re.error:
            wikipedia.output(u'(WWW) Ignoring invalid regular expression %s' % arg[8:])
    else:
        wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)
if not month:
    wikipedia.output(u'(FFF) No category given (-cat:xxx)')
    sys.exit(1)
# basic text tokens, etc.
cattitle = u'Category:%s' % month
base_redirover = u'#REDIRECT[[%s]]'
summ = u'[bot] shortcutting redirect(s)'
base_delsumm = u'[bot] deleting old soft redirect to [[%s]]'
comment_re = re.compile(r'(?ms)<!--.*?-->')
inconly_re = re.compile(r'(?ms)<includeonly>.*?</includeonly>')
nowiki_re = re.compile(r'(?ms)<nowiki>.*?</nowiki>')
link_re = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<sectionlink>#[^\]\|]*)?(?P<pipe>\|[^\]]*)?\]\]')
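# For reference, link_re splits u'[[Foo/Bar#Section|label]]' into the groups
# title=u'Foo/Bar', sectionlink=u'#Section' and pipe=u'|label'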
# Function to count instances of a substring in a string, with possible overlap
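# (matches may overlap, e.g. count_overlap(u'aaaa', u'aa') == 3)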
def count_overlap(string, substring):
    count = 0
    start = string.find(substring) + 1
    while start:
        count += 1
        start = string.find(substring, start) + 1
    return count
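# Function to fetch pages through the given cache, keyed on the normalised
# title, so repeated lookups return the same Page object (and its cached text)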
def get_page(pagecache, site, title):
    page = wikipedia.Page(site, title)
    try:
        result = pagecache[page.title()]
    except KeyError:
        pagecache[page.title()] = page
        result = pagecache[page.title()]
    return result
# Function to build search/replace pairs for all links to a given page
def make_search_replace_list(pagetext, pagetitle, oldtitle, newtitle, dontpipe = False):
    """pagetext: Text to be searched
    pagetitle: Title of the page to be searched (must not be a redirect page)
    oldtitle: Title to be found, as generated by wikipedia.Page.title()
    newtitle: New title to link to
    dontpipe: do not add pipe text if the pipe is missing

    pagetitle, oldtitle and newtitle should be mutually different.

    Returns a list of (search, replace) tuples, where replace is, if
    possible, a relative link whenever search is a relative link.

    Piping:
    - Existing pipes are not altered
    - When no pipe exists, the old link text is used as the pipe text by
      default
    - When no pipe exists and dontpipe == True, no pipe is inserted
    """
    text = pagetext
    result = []
    # The following code is similar to wikipedia.Page.linkedPages
    ### Kill all comments, nowiki and includeonly
    text = re.sub(comment_re, r'', text)
    text = re.sub(nowiki_re, r'', text)
    text = re.sub(inconly_re, r'', text)
    ### Extract all links
    for match in link_re.finditer(text):
        # Extract title and calculate replacement if it is equivalent to oldtitle
        oldlink = match.group(0)
        title = match.group(r'title')
        sectionlink = match.group(r'sectionlink')
        if sectionlink == None:
            sectionlink = u''
        pipetext = match.group(r'pipe')
        wtitle = title.strip()
        if len(wtitle) == 0: # Internal anchor
            continue
        # Check if the link begins with a colon
        if wtitle[0] == u':':
            colon = u':'
        else:
            colon = u''
        ### Ignore links to another wiki
        if site.isInterwikiLink(wtitle):
            continue
        ### Handle relative links
        relative = False
        nestlevel = count_overlap(wtitle, u'/../')
        if wtitle.startswith(u'../'):
            relative = True
            nestlevel += 1
        if (not wtitle.startswith(u'../' * nestlevel)) or (pagetitle.count(u'/') < nestlevel):
            # not a valid link
            continue
        wpagetitle = pagetitle
        ##### Calculate absolute link
        for i in range(nestlevel):
            wpagetitle = wpagetitle[:wpagetitle.rfind(u'/')]
            wtitle = wtitle[3:]
        if relative:
            wtitle = wpagetitle + u'/' + wtitle
        # If the calculated title ends with /, it is stripped.
        # Bug in MediaWiki?
        if wtitle.endswith(u'/'):
            wtitle = wtitle[:-1]
        if wtitle.startswith(u'/'):
            wtitle = wpagetitle + wtitle
            # Also a form of a relative link
            relative = True
        ### Normalise title
        try:
            wtitle = wikipedia.Page(site, wtitle).title()
        except wikipedia.Error:
            # Something wrong with the title
            wikipedia.output(u'(DDD) Title %s caused exception (pagetitle=%s, oldtitle=%s, newtitle=%s, oldlink=%s, extracted title=%s)' % (wtitle, pagetitle, oldtitle, newtitle, oldlink, title))
            continue
        if wtitle != oldtitle:
            # It's some other link
            continue
        ### Replace link with new link
        wnewtitle = newtitle
        if relative:
            # Make it a relative link
            ### How many levels are there in total in the page title?
            totallevels = pagetitle.count(u'/') + 1
            ### How many levels do the new title and the current page have in common?
            ##### Check the '/' form first, otherwise count matching
            ##### initial letters
            if wnewtitle.startswith(pagetitle):
                commonlevels = totallevels
            else:
                i = 0
                # Guard against running off the end when one title is a prefix of the other
                while i < len(wnewtitle) and i < len(pagetitle) and wnewtitle[i] == pagetitle[i]:
                    i += 1
                commonlevels = wnewtitle.count(u'/', 0, i + 1)
            ### Kill common levels from the new title and add
            ### sufficient "../"
            for i in range(commonlevels):
                wnewtitle = wnewtitle[wnewtitle.find(u'/') + 1:]
            if commonlevels == totallevels:
                wnewtitle = u'/' + wnewtitle
            wnewtitle = (u'../' * (totallevels - commonlevels)) + wnewtitle
        if pipetext == None:
            if dontpipe == False:
                pipetext = u'|' + title
            else:
                pipetext = u''
        newlink = u'[[%s%s%s%s]]' % ( colon, wnewtitle, sectionlink, pipetext )
        result.append((oldlink, newlink))
    return list(set(result))
# Start operation
site = wikipedia.getSite()
cat = catlib.Category(site, cattitle)
articles = list(cat.articles())
# Generate dictionary of pages linking to each soft redirect
pagecache = {}
linksdict = {}
included = set()
excluded = set()
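# linksdict maps each soft redirect title to the titles of the pages that
# reference it; "included" pages will be link-corrected, "excluded" pages
# matched an -xlink pattern and are left alone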
for page in articles:
    refs = page.getReferences()
    linksdict[page.title()] = []
    for ref in refs:
        if ref.title() in excluded:
            continue
        do_include = True
        for xlink in xlinks:
            match = xlink.match(ref.title())
            if (match != None) and (match.group() == ref.title()):
                do_include = False
        if do_include:
            linksdict[page.title()].append(ref.title())
            included.add(ref.title())
        else:
            excluded.add(ref.title())
included = sorted(included)
excluded = sorted(excluded)
wikipedia.output(u'(III) The following pages will be link-corrected:')
for title in included:
    wikipedia.output(u'* [[%s]]' % title)
wikipedia.output(u'(III) The following pages will be EXCLUDED from link correction:')
for title in excluded:
    wikipedia.output(u'* [[%s]]' % title)
# Now check which links are deemed unpipeable
dontpipe = set()
for page in articles:
    for pattern in nopipe:
        match = pattern.match(page.title())
        if (match != None) and (match.group() == page.title()):
            dontpipe.add(page.title())
            break
dontpipe = sorted(dontpipe)
wikipedia.output(u'(III) The following old links will not have pipes added:')
for title in dontpipe:
    wikipedia.output(u'[[%s]]' % title)
# Dump links
if dumplinks:
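    # Each line of the dump has the form:
    # * [[Linking page]] links to [[Soft redirect]]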
    while True:
        fname = wikipedia.input(u'File name for list of pages linking to soft redirects?')
        try:
            f = file(fname, u'a')
            if f.tell() != 0:
                wikipedia.output(u'(EEE) File %s already exists. Please choose another file name.' % fname)
                f.close()
            else:
                break
        except IOError:
            wikipedia.output(u'(EEE) IO Error during operation with %s. Please try again or choose another file name.' % fname)
    # Write links in Wiki markup. Exceptions terminate process.
    for title in linksdict.iterkeys():
        for link in linksdict[title]:
            f.write(u'* [[%s]] links to [[%s]]\n' % ( link, title ))
    f.close()
    wikipedia.output(u'Links written to file %s' % fname)
# Work out the corrected links first, without saving anything yet, and collect the old soft redirect pages to delete
oldtextdict = {}
changedict = {}
backrefdict = {}
deleteset = set()
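# oldtextdict: original wikitext of each referring page (for edit-conflict checks)
# changedict: updated wikitext of each referring page
# backrefdict: referring page title -> titles of the soft redirects it links to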
for title in linksdict.iterkeys():
    wikipedia.output(u'(III) Calculating updated links to %s' % title)
    softredir = get_page(pagecache, site, title)
    # Check if someone confused soft and hard redirs
    if softredir.isRedirectPage():
        wikipedia.output(u'(EEE) %s is a hard redirect, not a soft one' % title)
        continue
    # Extract new target
    newlist = softredir.linkedPages()
    ### There should be only one normal link
    if len(newlist) != 1:
        wikipedia.output(u'(EEE) No unambiguous target for soft redirect %s' % title)
        continue
    new = newlist[0]
    newtitle = new.title()
    # HACK!
    if new.namespace() in (6, 14):
        newtitle = u':' + newtitle
    # End HACK
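    # (namespaces 6 and 14 are the Image/File and Category namespaces; the
    # leading colon keeps the rewritten link from embedding the file or
    # categorising the page)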
    redirover = base_redirover % newtitle
    # Correct links for each page individually
    for pagetitle in linksdict[title]:
        page = get_page(pagecache, site, pagetitle)
        # Back link
        if not backrefdict.has_key(pagetitle):
            backrefdict[pagetitle] = []
        backrefdict[pagetitle].append(title)
        # Special treatment for redirect pages
        # These can be fixed immediately because they have exactly one link
        if page.isRedirectPage():
            try:
                wikipedia.output(u'(XXX) Fake put: Overwriting [[%s]] with "%s", summary "%s"' % ( pagetitle, redirover, summ ))
            except wikipedia.Error:
                wikipedia.output(u'(EEE) Unable to edit redirect %s' % pagetitle)
            continue
        # get text tokens to be replaced with new link
        try:
            try:
                text = changedict[pagetitle]
            except KeyError:
                text = page.get()
                oldtextdict[pagetitle] = text
            if title in dontpipe:
                override_pipe = True
            else:
                override_pipe = False
            srlist = make_search_replace_list(text, pagetitle, title, newtitle, override_pipe)
            for sr in srlist:
                wikipedia.output(u'(XXX) Replacing "%s" with "%s" in [[%s]]' % ( sr[0], sr[1], pagetitle ))
                text = wikipedia.replaceExcept(text, re.escape(sr[0]), sr[1], ['comment', 'math', 'nowiki', 'pre'])
            changedict[pagetitle] = text
        except wikipedia.Error:
            wikipedia.output(u'(EEE) Unable to process %s' % pagetitle)
    # Add soft redirect page to the set of to-be-deleted pages
    if delete:
        deleteset.add(title)
    else:
        wikipedia.output(u'(III) [to-be-deleted] %s' % title)
# Now update all links
for title in changedict.iterkeys():
    wikipedia.output(u'(III) Updating links in page %s' % title)
    # Get current version of page
    page = wikipedia.Page(site, title)
    # Update only if page wasn't edited since
    try:
        # Check if text has changed
        # Comparison of permalinks would be more efficient, but
        # unfortunately, pywikipedia's permalink feature is somewhat
        # broken
        if page.get() == oldtextdict[title]:
            wikipedia.output(u'(XXX) Fake put: Overwriting [[%s]] with summary "%s"' % ( title, summ ))
        else:
            wikipedia.output(u'(EEE) Not updating [[%s]]: Page was edited since' % title)
            # Don't delete soft redirects that still have issues
            # (backrefdict holds title strings, so discard them directly)
            for backlink in backrefdict[title]:
                deleteset.discard(backlink)
    except wikipedia.Error:
        wikipedia.output(u'(EEE) Unable to edit [[%s]]' % title)
# Lastly, delete the soft redirects
for title in deleteset:
    try:
        page = get_page(pagecache, site, title)
        delsumm = base_delsumm % page.linkedPages()[0].title()
        wikipedia.output(u'(XXX) Fake delete: [[%s]] with summary "%s"' % ( title, delsumm ))
    except wikipedia.Error:
        wikipedia.output(u'(EEE) Unable to delete %s' % title)
    except IndexError:
        wikipedia.output(u'(EEE) Not deleting %s: Unable to find redirect target' % title)