User:TalBot/header check.py
Jump to navigation
Jump to search
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Check for pages without a header
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#
# run with standard args "-log -putthrottle:xx"
#
import catlib, pagegenerators, wikipedia
# Set the read-throttle delay to 5 (presumably seconds between page fetches;
# confirm against the pywikipedia Throttle API).
wikipedia.get_throttle.setDelay(5)

# Whatever handleArgs() hands back is treated as unrecognised: this script
# takes no arguments of its own, so each returned item is only warned about.
# (The loop variable was previously named `args` while the body used `arg`,
# which raised NameError as soon as one such argument was present.)
for arg in wikipedia.handleArgs():
    wikipedia.output(u'(WWW) ignoring unrecognised argument: %s' % arg)
# Basic stuff
site = wikipedia.getSite()
# Namespaces whose pages are checked (u'' is presumably the main namespace).
namespaces = [u'', u'Author', u'Help', u'Portal', u'Wikisource']
# A page transcluding any of these templates counts as having a header.
template_names = [u'Archive header', u'Author', u'EB1911', u'Header',
                  u'Header2', u'Process header']
# Pages in these categories (including subcategories) are exempt from the check.
category_names = [u'Soft redirects', u'Protected deleted pages']

# generate page titles
allpages = set()      # titles of every non-redirect page in `namespaces`
templaterefs = set()  # titles of pages transcluding one of `template_names`
catpages = set()      # titles of pages in one of `category_names`
try:
    for namespace in namespaces:
        ns_index = site.getNamespaceIndex(namespace)
        ns_pages = pagegenerators.AllpagesPageGenerator(
            namespace=ns_index, includeredirects=False)
        allpages.update(page.title() for page in ns_pages)
    for template in template_names:
        template_page = wikipedia.Page(site, u'Template:' + template)
        # Only real transclusions count, not plain links to the template.
        referrers = template_page.getReferences(onlyTemplateInclusion=True)
        templaterefs.update(page.title() for page in referrers)
    for category in category_names:
        cat = catlib.Category(site, u'Category:' + category)
        catpages.update(page.title() for page in cat.articles(recurse=True))
finally:
    # All wiki access is finished (or has failed) at this point; call
    # stopme() even when a query raised, so it is never skipped on error.
    wikipedia.stopme()
#
# List pages with no header
#
# A page lacks a header when it was listed among all pages but neither
# transcludes a header template nor belongs to an exempt category.
noheaders = list(allpages.difference(templaterefs).difference(catpages))
noheaders.sort()
wikipedia.output(u'(III) Pages with no headers:')
# One wiki-list line per title, each formatted as a wikilink.
for page_title in noheaders:
    wikipedia.output(u'* [[%s]]' % page_title)