User:TalBot/test-rm-soft-redir.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Remove soft redirects for specified months after fixing the pages linking to them
#
# Copyright © 2006—2010, GrafZahl (en.wikisource.org user)
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
#
# Uses some ideas from wikipedia.py by Rob W.W. Hooft, Andre Engels, which is
# distributed under the terms of the MIT licence.
#
# run with standard args "-log -putthrottle:xx"
#
# Further arguments:
#
# -cat:xxx
# Specifies the category for which soft redirects should be
# removed, for example: -cat:'Soft redirects/August 2006'
# (replace the single quotes with whatever is appropriate for
# your shell)
#
# -dumplinks
# Write all pages linking to a soft redirect page for the given
# month to a file
#
# -delete
# Actually try to delete the pages (assumes sysop privileges!).
# Otherwise the to-be-deleted page will be logged with
# [to-be-deleted] prefix.
#
# -xlink:xxx
# Specifies a set of pages to be excluded from link correction
# as a regular expression. For example, to exclude all
# discussion archives, specify -xlink:'.*/Archive.*' (replace
# the single quotes with whatever is appropriate for your
# shell).
#
# -nopipe:xxx
# Specifies a set of soft redirects as a regular expression.
# These redirects will not be added to corrected links as
# pipes. Pipes that already exist will not be altered.
#
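# For example, a full invocation might look like this (shell quoting as
# appropriate; the throttle value and flag combination are only illustrative):
#
#     python test-rm-soft-redir.py -log -putthrottle:10 \
#         -cat:'Soft redirects/August 2006' -dumplinks -xlink:'.*/Archive.*'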
#
import catlib, re, sys, wikipedia
wikipedia.get_throttle.setDelay(5)
# Handle args
args = wikipedia.handleArgs()
month = False
delete = False
dumplinks = False
xlinks = []
nopipe = []
for arg in args:
    if arg[:5] == u'-cat:':
        month = arg[5:]
    elif arg == u'-delete':
        delete = True
    elif arg == u'-dumplinks':
        dumplinks = True
    elif arg[:7] == u'-xlink:':
        try:
            xlinks.append(re.compile(arg[7:]))
        except re.error:
            wikipedia.output(u'(WWW) Ignoring invalid regular expression %s' % arg[7:])
    elif arg[:8] == u'-nopipe:':
        try:
            nopipe.append(re.compile(arg[8:]))
        except re.error:
            wikipedia.output(u'(WWW) Ignoring invalid regular expression %s' % arg[8:])
    else:
        wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)
if not month:
    wikipedia.output(u'(FFF) No category given (-cat:xxx)')
    sys.exit(1)
# basic text tokens, etc.
cattitle = u'Category:%s' % month
base_redirover = u'#REDIRECT[[%s]]'
summ = u'[bot] shortcutting redirect(s)'
base_delsumm = u'[bot] deleting old soft redirect to [[%s]]'
comment_re = re.compile(r'(?ms)<!--.*?-->')
inconly_re = re.compile(r'(?ms)<includeonly>.*?</includeonly>')
nowiki_re = re.compile(r'(?ms)<nowiki>.*?</nowiki>')
link_re = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<sectionlink>#[^\]\|]*)?(?P<pipe>\|[^\]]*)?\]\]')
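# For reference, link_re splits u'[[Foo/Bar#Section|label]]' into the groups
# title=u'Foo/Bar', sectionlink=u'#Section' and pipe=u'|label'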
# Function to count instances of a substring in a string, with possible overlap
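# (matches may overlap, e.g. count_overlap(u'aaaa', u'aa') == 3)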
def count_overlap(string, substring):
    count = 0
    start = string.find(substring) + 1
    while start:
        count += 1
        start = string.find(substring, start) + 1
    return count
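# Function to fetch pages through the given cache, keyed on the normalised
# title, so repeated lookups return the same Page object (and its cached text)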
def get_page(pagecache, site, title):
    page = wikipedia.Page(site, title)
    try:
        result = pagecache[page.title()]
    except KeyError:
        pagecache[page.title()] = page
        result = pagecache[page.title()]
    return result
# Function to build search/replace pairs for all links to a given page
def make_search_replace_list(pagetext, pagetitle, oldtitle, newtitle, dontpipe = False):
    """pagetext: Text to be searched
    pagetitle: Title of the page to be searched (must not be a redirect page)
    oldtitle: Title to be found, as generated by wikipedia.Page.title()
    newtitle: New title to link to
    dontpipe: do not add pipe text if the pipe is missing

    pagetitle, oldtitle and newtitle should be mutually different.

    Returns a list of (search, replace) tuples, where replace is, if
    possible, a relative link whenever search is a relative link.

    Piping:
    - Existing pipes are not altered
    - When no pipe exists, the old link text is used as the pipe text by
      default
    - When no pipe exists and dontpipe == True, no pipe is inserted
    """
    text = pagetext
    result = []
    # The following code is similar to wikipedia.Page.linkedPages
    ### Kill all comments, nowiki and includeonly
    text = re.sub(comment_re, r'', text)
    text = re.sub(nowiki_re, r'', text)
    text = re.sub(inconly_re, r'', text)
    ### Extract all links
    for match in link_re.finditer(text):
        # Extract title and calculate replacement if it is equivalent to oldtitle
        oldlink = match.group(0)
        title = match.group(r'title')
        sectionlink = match.group(r'sectionlink')
        if sectionlink == None:
            sectionlink = u''
        pipetext = match.group(r'pipe')
        wtitle = title.strip()
        if len(wtitle) == 0: # Internal anchor
            continue
        # Check if the link begins with a colon
        if wtitle[0] == u':':
            colon = u':'
        else:
            colon = u''
        ### Ignore links to another wiki
        if site.isInterwikiLink(wtitle):
            continue
        ### Handle relative links
        relative = False
        nestlevel = count_overlap(wtitle, u'/../')
        if wtitle.startswith(u'../'):
            relative = True
            nestlevel += 1
        if (not wtitle.startswith(u'../' * nestlevel)) or (pagetitle.count(u'/') < nestlevel):
            # not a valid link
            continue
        wpagetitle = pagetitle
        ##### Calculate absolute link
        for i in range(nestlevel):
            wpagetitle = wpagetitle[:wpagetitle.rfind(u'/')]
            wtitle = wtitle[3:]
        if relative:
            wtitle = wpagetitle + u'/' + wtitle
        # If the calculated title ends with /, it is stripped.
        # Bug in MediaWiki?
        if wtitle.endswith(u'/'):
            wtitle = wtitle[:-1]
        if wtitle.startswith(u'/'):
            wtitle = wpagetitle + wtitle
            # Also a form of a relative link
            relative = True
        ### Normalise title
        try:
            wtitle = wikipedia.Page(site, wtitle).title()
        except wikipedia.Error:
            # Something wrong with the title
            wikipedia.output(u'(DDD) Title %s caused exception (pagetitle=%s, oldtitle=%s, newtitle=%s, oldlink=%s, extracted title=%s)' % (wtitle, pagetitle, oldtitle, newtitle, oldlink, title))
            continue
        if wtitle != oldtitle:
            # It's some other link
            continue
        ### Replace link with new link
        wnewtitle = newtitle
        if relative:
            # Make it a relative link
            ### How many levels are there in total in the page title?
            totallevels = pagetitle.count(u'/') + 1
            ### How many levels do the new title and the current page have in common?
            ##### Check the '/' form first, otherwise count matching
            ##### initial letters
            if wnewtitle.startswith(pagetitle):
                commonlevels = totallevels
            else:
                i = 0
                # Guard against running off the end when one title is a prefix of the other
                while i < len(wnewtitle) and i < len(pagetitle) and wnewtitle[i] == pagetitle[i]:
                    i += 1
                commonlevels = wnewtitle.count(u'/', 0, i + 1)
            ### Kill common levels from the new title and add
            ### sufficient "../"
            for i in range(commonlevels):
                wnewtitle = wnewtitle[wnewtitle.find(u'/') + 1:]
            if commonlevels == totallevels:
                wnewtitle = u'/' + wnewtitle
            wnewtitle = (u'../' * (totallevels - commonlevels)) + wnewtitle
        if pipetext == None:
            if dontpipe == False:
                pipetext = u'|' + title
            else:
                pipetext = u''
        newlink = u'[[%s%s%s%s]]' % ( colon, wnewtitle, sectionlink, pipetext )
        result.append((oldlink, newlink))
    return list(set(result))
# Start operation
site = wikipedia.getSite()
cat = catlib.Category(site, cattitle)
articles = list(cat.articles())
# Generate dictionary of pages linking to each soft redirect
pagecache = {}
linksdict = {}
included = set()
excluded = set()
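# linksdict maps each soft redirect title to the titles of the pages that
# reference it; "included" pages will be link-corrected, "excluded" pages
# matched an -xlink pattern and are left alone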
for page in articles:
    refs = page.getReferences()
    linksdict[page.title()] = []
    for ref in refs:
        if ref.title() in excluded:
            continue
        do_include = True
        for xlink in xlinks:
            match = xlink.match(ref.title())
            if (match != None) and (match.group() == ref.title()):
                do_include = False
        if do_include:
            linksdict[page.title()].append(ref.title())
            included.add(ref.title())
        else:
            excluded.add(ref.title())
included = sorted(included)
excluded = sorted(excluded)
wikipedia.output(u'(III) The following pages will be link-corrected:')
for title in included:
    wikipedia.output(u'* [[%s]]' % title)
wikipedia.output(u'(III) The following pages will be EXCLUDED from link correction:')
for title in excluded:
    wikipedia.output(u'* [[%s]]' % title)
# Now check which links are deemed unpipeable
dontpipe = set()
for page in articles:
    for pattern in nopipe:
        match = pattern.match(page.title())
        if (match != None) and (match.group() == page.title()):
            dontpipe.add(page.title())
            break
dontpipe = sorted(dontpipe)
wikipedia.output(u'(III) The following old links will not have pipes added:')
for title in dontpipe:
    wikipedia.output(u'[[%s]]' % title)
# Dump links
if dumplinks:
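    # Each line of the dump has the form:
    # * [[Linking page]] links to [[Soft redirect]]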
    while True:
        fname = wikipedia.input(u'File name for list of pages linking to soft redirects?')
        try:
            f = file(fname, u'a')
            if f.tell() != 0:
                wikipedia.output(u'(EEE) File %s already exists. Please choose another file name.' % fname)
                f.close()
            else:
                break
        except IOError:
            wikipedia.output(u'(EEE) IO Error during operation with %s. Please try again or choose another file name.' % fname)
    # Write links in Wiki markup. Exceptions terminate process.
    for title in linksdict.iterkeys():
        for link in linksdict[title]:
            f.write(u'* [[%s]] links to [[%s]]\n' % ( link, title ))
    f.close()
    wikipedia.output(u'Links written to file %s' % fname)
# Work out the corrected links first, without saving anything yet, and collect the old soft redirect pages to delete
oldtextdict = {}
changedict = {}
backrefdict = {}
deleteset = set()
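# oldtextdict: original wikitext of each referring page (for edit-conflict checks)
# changedict: updated wikitext of each referring page
# backrefdict: referring page title -> titles of the soft redirects it links to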
for title in linksdict.iterkeys():
    wikipedia.output(u'(III) Calculating updated links to %s' % title)
    softredir = get_page(pagecache, site, title)
    # Check if someone confused soft and hard redirs
    if softredir.isRedirectPage():
        wikipedia.output(u'(EEE) %s is a hard redirect, not a soft one' % title)
        continue
    # Extract new target
    newlist = softredir.linkedPages()
    ### There should be only one normal link
    if len(newlist) != 1:
        wikipedia.output(u'(EEE) No unambiguous target for soft redirect %s' % title)
        continue
    new = newlist[0]
    newtitle = new.title()
    # HACK!
    if new.namespace() in (6, 14):
        newtitle = u':' + newtitle
    # End HACK
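    # (namespaces 6 and 14 are the Image/File and Category namespaces; the
    # leading colon keeps the rewritten link from embedding the file or
    # categorising the page)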
    redirover = base_redirover % newtitle
    # Correct links for each page individually
    for pagetitle in linksdict[title]:
        page = get_page(pagecache, site, pagetitle)
        # Back link
        if not backrefdict.has_key(pagetitle):
            backrefdict[pagetitle] = []
        backrefdict[pagetitle].append(title)
        # Special treatment for redirect pages
        # These can be fixed immediately because they have exactly one link
        if page.isRedirectPage():
            try:
                wikipedia.output(u'(XXX) Fake put: Overwriting [[%s]] with "%s", summary "%s"' % ( pagetitle, redirover, summ ))
            except wikipedia.Error:
                wikipedia.output(u'(EEE) Unable to edit redirect %s' % pagetitle)
            continue
        # get text tokens to be replaced with new link
        try:
            try:
                text = changedict[pagetitle]
            except KeyError:
                text = page.get()
                oldtextdict[pagetitle] = text
            if title in dontpipe:
                override_pipe = True
            else:
                override_pipe = False
            srlist = make_search_replace_list(text, pagetitle, title, newtitle, override_pipe)
            for sr in srlist:
                wikipedia.output(u'(XXX) Replacing "%s" with "%s" in [[%s]]' % ( sr[0], sr[1], pagetitle ))
                text = wikipedia.replaceExcept(text, re.escape(sr[0]), sr[1], ['comment', 'math', 'nowiki', 'pre'])
            changedict[pagetitle] = text
        except wikipedia.Error:
            wikipedia.output(u'(EEE) Unable to process %s' % pagetitle)
    # Add soft redirect page to the set of to-be-deleted pages
    if delete:
        deleteset.add(title)
    else:
        wikipedia.output(u'(III) [to-be-deleted] %s' % title)
# Now update all links
for title in changedict.iterkeys():
    wikipedia.output(u'(III) Updating links in page %s' % title)
    # Get current version of page
    page = wikipedia.Page(site, title)
    # Update only if page wasn't edited since
    try:
        # Check if text has changed
        # Comparison of permalinks would be more efficient, but
        # unfortunately, pywikipedia's permalink feature is somewhat
        # broken
        if page.get() == oldtextdict[title]:
            wikipedia.output(u'(XXX) Fake put: Overwriting [[%s]] with summary "%s"' % ( title, summ ))
        else:
            wikipedia.output(u'(EEE) Not updating [[%s]]: Page was edited since' % title)
            # Don't delete soft redirects that still have issues
            # (backrefdict holds title strings, so discard them directly)
            for backlink in backrefdict[title]:
                deleteset.discard(backlink)
    except wikipedia.Error:
        wikipedia.output(u'(EEE) Unable to edit [[%s]]' % title)
# Lastly, delete the soft redirects
for title in deleteset:
    try:
        page = get_page(pagecache, site, title)
        delsumm = base_delsumm % page.linkedPages()[0].title()
        wikipedia.output(u'(XXX) Fake delete: [[%s]] with summary "%s"' % ( title, delsumm ))
    except wikipedia.Error:
        wikipedia.output(u'(EEE) Unable to delete %s' % title)
    except IndexError:
        wikipedia.output(u'(EEE) Not deleting %s: Unable to find redirect target' % title)