# Source page: "User:TalBot/replace-link.py" (wiki navigation chrome removed)
#! /usr/bin/env python
# _*_ coding: utf8 _*_
#
# Update links in a given list of pages.
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#
# Reuses some stuff from rm-soft-redir.py by the same author.
#
# run with standard args "-log -putthrottle:xx"
#
# Further arguments:
#
# -oldlink:xxx
# Link to be replaced; must be a valid page title.
#
# -newlink:xxx
# Link to replace the old link with
#
# -pagelist:xxx
# File containing a list of newline separated page titles
# in which the replacement should take place. Must be
# UTF-8 encoded.
#
# -summary:xxx
# Edit summary
#
# WARNING: This bot script was written for the English Wikisource, which
# is a UTF-8 wiki. For non UTF-8 wikis you must change the
# explicit UTF-8 conversion below to suit your needs. Remember
# that this is GPL software, so there is NO WARRANTY OF ANY
# KIND, TO THE EXTENT ALLOWED BY APPLICABLE LAW.
#
import re, sys, wikipedia

# Be gentle with the server: at least 5 seconds between requests.
wikipedia.get_throttle.setDelay(5)
# Handle args
args = wikipedia.handleArgs()
oldtitles = []      # titles to be replaced, one per -oldlink:
newtitles = []      # replacement titles, matched pairwise with oldtitles
pagefilename = False
summary = False
for arg in args:
    if arg.startswith(u'-oldlink:'):
        oldtitles.append(arg[9:])
    elif arg.startswith(u'-newlink:'):
        newtitles.append(arg[9:])
    elif arg.startswith(u'-pagelist:'):
        pagefilename = arg[10:]
    elif arg.startswith(u'-summary:'):
        summary = arg[9:]
    else:
        wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)
# Validate arguments. NOTE: the messages name the flags actually parsed
# above (-oldlink:/-newlink:); earlier revisions wrongly said -oldtitle:/
# -newtitle:, which are not recognised by the loop above.
if len(oldtitles) == 0:
    wikipedia.output(u'(FFF) no old link title given (-oldlink:xxx)')
    sys.exit(1)
if len(newtitles) != len(oldtitles):
    wikipedia.output(u'(FFF) You must specify -newlink:xxx exactly as often as -oldlink:xxx')
    sys.exit(1)
if not pagefilename:
    wikipedia.output(u'(FFF) no file with page list given (-pagelist:xxx)')
    sys.exit(1)
if not summary:
    wikipedia.output(u'(WWW) No edit summary given (-summary:xxx)')
    summary = u'(no summary)'
# basic text templates
summ = u'[bot] Automatic link replacement: %s' % summary
# Spans to ignore when scanning for links:
comment_re = re.compile(r'(?ms)<!--.*?-->')
inconly_re = re.compile(r'(?ms)<includeonly>.*?</includeonly>')
nowiki_re = re.compile(r'(?ms)<nowiki>.*?</nowiki>')
# Wikilink: [[title#section|pipe]] with section and pipe optional.
link_re = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<sectionlink>#[^\]\|]*)?(?P<pipe>\|[^\]]*)?\]\]')
# Function to count instances of a substring in a string, with possible overlap
def count_overlap(string, substring):
count = 0
start = string.find(substring) + 1
while start:
count += 1
start = string.find(substring, start) + 1
return count
# Function to extract all links to a given non-redirect page
# Function to extract all links to a given non-redirect page
def make_search_replace_list(pagetext, pagetitle, oldtitle, newtitle, dontpipe = False):
    """pagetext: Text to be searched
    pagetitle: Title of page to be searched (must not be a redirect page)
    oldtitle: title to be found, generated by wikipedia.Page.title()
    newtitle: New title to link to
    dontpipe: do not add pipe text if pipe is missing
    pagetitle, oldtitle and newtitle should be mutually different
    Returns list of (search, replace) tuples, where replace is, if
    possible, a relative link, if search is a relative link
    Piping:
    - Existing pipes are not altered
    - When no pipe exists, the old link will be used as pipes by
      default
    - When no pipe exists and dontpipe == True, no pipe will be
      inserted
    """
    text = pagetext
    result = []
    # The following code is similar to wikipedia.Page.linkedPages
    ### Kill all comments, nowiki and includeonly
    text = re.sub(comment_re, r'', text)
    text = re.sub(nowiki_re, r'', text)
    text = re.sub(inconly_re, r'', text)
    ### Extract all links
    for match in link_re.finditer(text):
        # Extract title and calculate replacement if it is equivalent to newtitle
        oldlink = match.group(0)                   # full original [[...]] markup
        title = match.group(r'title')              # raw title part, unnormalised
        sectionlink = match.group(r'sectionlink')  # '#...' part or None
        if sectionlink == None:
            sectionlink = u''
        pipetext = match.group(r'pipe')            # '|...' part or None
        wtitle = title.strip()
        # Check if the link begins with a colon (e.g. [[:Category:X]]);
        # the colon is preserved verbatim in the replacement.
        if len(wtitle) > 0 and wtitle[0] == u':':
            colon = u':'
        else:
            colon = u''
        ### Ignore links to another wiki
        if site.isInterwikiLink(wtitle):
            continue
        ### Handle relative links
        # nestlevel = number of '../' components at the start of a
        # relative link; counted via the '/../' separators plus the
        # leading '../' itself.
        relative = False
        nestlevel = count_overlap(wtitle, u'/../')
        if wtitle.startswith(u'../'):
            relative = True
            nestlevel += 1
        # Reject titles where '/../' appears other than as a leading run,
        # or that climb above the top of the page's subpage hierarchy.
        if not wtitle.startswith(u'../' * nestlevel) or pagetitle.count(u'/') < nestlevel:
            # not a valid link
            continue
        wpagetitle = pagetitle
        ##### Calculate absolute link
        # Strip one subpage level from the current page title and one
        # '../' from the link title per nest level.
        for i in range(nestlevel):
            wpagetitle = wpagetitle[:wpagetitle.rfind(u'/')]
            wtitle = wtitle[3:]
        if relative:
            wtitle = wpagetitle + u'/' + wtitle
            # If the calculated title ends with /, it is stripped.
            # Bug in MediaWiki?
            if wtitle.endswith(u'/'):
                wtitle = wtitle[:-1]
        # A leading '/' is a subpage link relative to the current page.
        if wtitle.startswith(u'/'):
            wtitle = wpagetitle + wtitle
            # Also a form of a relative link
            relative = True
        ### Normalise title
        # Round-trip through wikipedia.Page so wtitle is in the same
        # canonical form as oldtitle (which came from Page.title()).
        try:
            wtitle = wikipedia.Page(site, wtitle).title()
        except wikipedia.Error:
            # Something wrong with the title
            wikipedia.output(u'(DDD) Title %s caused exception' % wtitle)
            continue
        if wtitle != oldtitle:
            # It's some other link
            continue
        ### Replace link with new link
        wnewtitle = newtitle
        if relative:
            # Make it a relative link
            ### How many levels do the new title and the current page have in common?
            # NOTE(review): this scan can raise IndexError when one title
            # is a strict prefix of the other — confirm inputs rule that out.
            i = 0
            while wnewtitle[i] == pagetitle[i]:
                i += 1
            commonlevels = wnewtitle.count(u'/', 0, i)
            ### How many levels are there in total in the page title?
            totallevels = pagetitle.count(u'/')
            ### kill common levels from new title and add sufficient "../"
            for i in range(commonlevels):
                wnewtitle = wnewtitle[wnewtitle.find(u'/') + 1:]
            if commonlevels == totallevels:
                # Same parent: use a '/subpage'-style link.
                wnewtitle = u'/' + wnewtitle
            wnewtitle = (u'../' * (totallevels - commonlevels)) + wnewtitle
        if pipetext == None:
            if dontpipe == False:
                # Preserve the visible text of the old link as pipe text.
                pipetext = u'|' + title
            else:
                pipetext = u''
        newlink = u'[[%s%s%s%s]]' % ( colon, wnewtitle, sectionlink, pipetext )
        result.append((oldlink, newlink))
    # Deduplicate; order of the returned pairs is not significant.
    return list(set(result))
# Start operation
# Start operation
site = wikipedia.getSite()
try:
    oldpages = [ wikipedia.Page(site, oldtitle) for oldtitle in oldtitles ]
    newpages = [ wikipedia.Page(site, newtitle) for newtitle in newtitles ]
except wikipedia.Error:
    wikipedia.output(u'(FFF) invalid titles')
    sys.exit(1)
# Load page list: one UTF-8 encoded page title per line.
pagelist = []
try:
    # open() instead of the deprecated file() builtin (removed in Python 3).
    pagefile = open(pagefilename, 'r')
except IOError:
    wikipedia.output(u'(FFF) Unable to load page file "%s"' % pagefilename)
    sys.exit(1)
try:
    for line in pagefile:
        line = line.rstrip()
        try:
            # Explicit UTF-8 decode; see the warning in the file header
            # about non-UTF-8 wikis. (Python 2 unicode().)
            page = wikipedia.Page(site, unicode(line, 'UTF-8'))
            if page.exists():
                pagelist.append(page)
            else:
                wikipedia.output(u'(EEE) Page [[%s]] does not exist' % line)
        except wikipedia.Error:
            wikipedia.output(u'(EEE) Error with page [[%s]], ignoring this page' % line)
except IOError:
    # Best effort: keep whatever pages were read before the error.
    wikipedia.output(u'(EEE) IO-Error reading from file "%s", ignoring remaining lines' % pagefilename)
try:
    pagefile.close()
except IOError:
    wikipedia.output(u'(EEE) IO-error closing file')
# Generate new text
# Generate new text.
# oldtextdict maps page title -> text as originally fetched (used later to
# detect concurrent edits); changedict maps page title -> updated text.
oldtextdict = {}
changedict = {}
for page in pagelist:
    wikipedia.output(u'(III) Calculating updated links in [[%s]]' % page.title())
    try:
        try:
            # A page may be listed more than once: continue from the
            # already-updated text instead of refetching.
            text = changedict[page.title()]
        except KeyError:
            text = page.get()
            oldtextdict[page.title()] = text
        srlist = []
        # Pair each old title with its replacement (idiomatic zip instead
        # of indexing; oldpages and newpages are equal-length by the
        # argument check above).
        for oldpage, newpage in zip(oldpages, newpages):
            srlist += make_search_replace_list(text, page.title(), oldpage.title(), newpage.title(), True)
        for sr in srlist:
            # Replace each found link, skipping protected contexts.
            text = wikipedia.replaceExcept(text, re.escape(sr[0]), sr[1], ['comment', 'math', 'nowiki', 'pre'])
        changedict[page.title()] = text
    except wikipedia.Error:
        wikipedia.output(u'(EEE) Unable to process %s' % page.title())
# Now update all links
# Now update all links
for title in changedict:
    # Unicode literal for consistency with every other message in this
    # script (was a byte string).
    wikipedia.output(u'(III) Updating links in page %s' % title)
    # Get current version of page
    page = wikipedia.Page(site, title)
    # Update only if page wasn't edited since
    try:
        # Check if text has changed
        # Comparison of permalinks would be more efficient, but
        # unfortunately, pywikipedia's permalink feature is somewhat
        # broken
        if page.get() == oldtextdict[title]:
            page.put(changedict[title], summ)
        else:
            wikipedia.output(u'(EEE) Not updating [[%s]]: Page was edited since' % title)
    except wikipedia.Error:
        wikipedia.output(u'(EEE) Unable to edit [[%s]]' % title)