User:TalBot/xo pp check.py

From Wikisource
Jump to navigation Jump to search
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Check for (and report) extra stuff before the header of Executive Orders
# and Presidential Proclamations
#
# run with args "-log -putthrottle:xx"
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#

import pagegenerators, re, wikipedia

# Delay applied by the framework's read throttle (setDelay(5): presumably
# 5 seconds between fetches -- confirm against the throttle documentation).
wikipedia.get_throttle.setDelay(5)

# Handle args

# Whatever handleArgs() hands back is treated as unrecognised here: each
# item is reported and otherwise ignored.
args = wikipedia.handleArgs()

for arg in args:
	wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)

# Basic text tokens

# Looks like an edit summary; it is not referenced anywhere else in the
# script as shown.
summ = u'Removing garbage before header'

# Regexes

# Matches the opening of a header template: "{{", optional whitespace, then
# "Header" or "header".
header_xp = re.compile(r'\{\{\s*[Hh]eader')

# page generators

# One generator per title prefix (presumably yielding the pages whose titles
# start with that prefix -- confirm against pagegenerators).
xo_pages = pagegenerators.PrefixingPageGenerator(u'Executive Order')
pp_pages = pagegenerators.PrefixingPageGenerator(u'Proclamation')

# Procedure to check extra stuff before header

def check_stuff_before_header(page):
	"""Report whatever text precedes the header template on *page*.

	Redirect pages, and pages in which header_xp finds no header, are
	reported as such and skipped.  Otherwise everything in the page text
	up to the start of the first header match is written to the output
	(an empty string when the header opens the page).  The page is only
	read here, never saved.

	page -- page object; must provide title(), isRedirectPage() and get()

	Returns None.
	"""
	wikipedia.output(u'(III) Checking [[%s]]' % page.title())
	if page.isRedirectPage():
		wikipedia.output(u'   (XXX) This page is a redirect')
		return
	text = page.get()
	match = header_xp.search(text)
	# search() yields None when nothing matches; None is a singleton, so
	# test identity rather than equality.
	if match is None:
		wikipedia.output(u'   (XXX) This page does not have a header')
		return
	# Slice up to the first header match: that prefix is the "extra stuff".
	wikipedia.output(u'   (XXX) Text before header:\n   %s' % text[:match.start()])

# check pages

# Walk both page sets in turn -- Executive Orders first, then
# Proclamations -- and run the header check on every page.
for pages in (xo_pages, pp_pages):
	for page in pages:
		check_stuff_before_header(page)