User:Inductiveload/Scripts/WP backlink finder
Jump to navigation
Jump to search
This program finds author pages which link to Wikipedia using the "wikipedia" field in the {{author}} template, but which do not have a return link from Wikipedia to the author page.
It takes no parameters, the input and output files are specified in the script:
python find_wp_author_without_backlinks.py
Source
[edit]#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# find_wp_author_without_backlinks.py
#
# This script finds authors which have a link to wikipedia, wikiquote
# or commons but no return link from that site.
#
# It uses User:Phe's query_ext.py script to find a list of pages in the
# relevant categories
#
# A list of authors and corresponding WP articles without backlinks
# is returned as the output file.
#
import pw_script_header #adds the pywikipedia directory to the python PATH
import wikipedia
import query_ext
import codecs
import re
class WS_External_Backlink_Checker():
    """Find Wikisource author pages whose {{author}} template links to an
    external wiki (Wikipedia or Wikiquote) but where the linked page has no
    {{wikisource author}} return link.

    Construction runs the whole scan and writes a wikitext list of offending
    pages to the output file configured in __init__.
    """

    def check_for_missing_backlinks(self, wiki):
        """Check every author page linking to ``wiki`` for a return link.

        wiki -- 'wikipedia' or 'wikiquote'; any other value is reported as
                an error and the method returns without doing anything.

        For each linked external page that is missing, has a missing
        section, or lacks a {{wikisource author}} template, a list entry
        is appended to self.out_filelist.
        """
        if wiki == 'wikipedia':
            wikiname = 'Wikipedia'
            wikiprefix = 'w'
            category = 'Category:Author pages linking to Wikipedia'
            ext_site = wikipedia.getSite("en", "wikipedia")
            header_regex = r"\| *[Ww]ikipedia *= *(.*)"
            search_regex = r"{{ *[Ww]ikisource[ -]?author"
        elif wiki == 'wikiquote':
            wikiname = 'Wikiquote'
            wikiprefix = 'q'
            category = 'Category:Author pages linking to Wikiquote'
            ext_site = wikipedia.getSite("en", "wikiquote")
            header_regex = r"\| *[Ww]ikiquote *= *(.*)"
            search_regex = r"{{ *[Ww]ikisource[ -]?author"
        else:
            # BUG FIX: the original fell through after printing this error,
            # leaving category/wikiname/etc. unbound and raising NameError
            # on the next line. Bail out instead.
            print('(ERR) Unknown wiki: %s' % wiki)
            return

        # All author pages in the relevant maintenance category.
        ws_pages_with_links = query_ext.PreloadingCategory(category, recurse = False, filtered_cat = [], site = self.ws_site)

        for ws_page in ws_pages_with_links:  # for every author page with an external link
            ws_page_title = ws_page['title']
            print('INF: Processing page: %s' % ws_page_title)
            ws_page = wikipedia.Page(self.ws_site, ws_page_title)  # get the page
            ws_page_text = ws_page.get()  # extract wikitext

            # Look for the name of the linked external article in the
            # {{author}} template field.
            m = re.search(header_regex, ws_page_text)
            if not m:  # failed to find, skip this one
                print("\t(INF) %s article not found" % wikiname)
                continue

            ext_page_title = m.group(1)  # the external article name
            print("\tINF: Found %s page: %s" % (wikiname, ext_page_title))

            ext_page = wikipedia.Page(ext_site, ext_page_title)  # the external article page
            try:
                ext_page_text = ext_page.get(get_redirect=True)  # extract wikitext
            except wikipedia.NoPage:
                print("INF: Linked %s page doesn't exist" % wikiname)
                # BUG FIX: the first link slot is the local Wikisource author
                # page; the original wrote ext_page_title in both slots.
                self.out_filelist.write('* [[%s]] ------> [[%s:%s]] (Non-existent page)\n' % (ws_page_title, wikiprefix, ext_page_title) )
                continue
            except wikipedia.SectionError:
                print("\tINF: Linked %s section doesn't exist" % wikiname)
                self.out_filelist.write('* [[%s]] ------> [[%s:%s]] (Non-existent section)\n' % (ws_page_title, wikiprefix, ext_page_title) )
                continue

            # Look for a {{wikisource author}} return-link template.
            m = re.search(search_regex, ext_page_text)
            if not m:  # didn't find it, record the page
                print("\tINF: %s --> Wikisource link not found." % wikiname)
                self.out_filelist.write('* [[%s]] ------> [[%s:%s]]\n' % (ws_page_title, wikiprefix, ext_page_title) )
            else:
                print("\tINF: %s --> Wikisource link found." % wikiname)

    def __init__(self):
        """Open the output file, set up the sites and run both scans."""
        # Output file for the report; change the filename as needed.
        out_filelistname = '/home/john/src/pw/zz_filelist1.txt'
        self.out_filelist = codecs.open(out_filelistname, 'w', 'utf-8')

        # Set up the sites we will be looking at.
        self.ws_site = wikipedia.getSite("en", "wikisource")
        # NOTE(review): the commons site is set up but never used; kept for
        # compatibility with the header comment's stated intent.
        self.cm_site = wikipedia.getSite("commons", "commons")

        try:
            self.check_for_missing_backlinks('wikipedia')
            self.check_for_missing_backlinks('wikiquote')
        finally:
            # BUG FIX: the original never closed the output file; close it
            # even if a scan raises, so buffered report lines are flushed.
            self.out_filelist.close()
if __name__ == '__main__':
    # Constructing the checker runs the full scan and writes the report.
    checker = WS_External_Backlink_Checker()