User:TalBot/xo pp check.py
Jump to navigation
Jump to search
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Fix extra stuff before Executive Orders and Presidential Proclamations
#
# run with args "-log -putthrottle:xx"
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#
# NOTE(review): the code in this file only *reports* the text found before
# the header template; it never saves a page, and the edit summary `summ`
# below is not referenced anywhere in the code visible here.

import re

import pagegenerators
import wikipedia

wikipedia.get_throttle.setDelay(5)

# Handle args: whatever the framework hands back is merely reported
# and otherwise ignored.
args = wikipedia.handleArgs()

for arg in args:
    wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)

# Basic text tokens

summ = u'Removing garbage before header'

# Regexes

# Matches the opening of a header template: "{{", optional whitespace,
# then "Header" or "header".
header_xp = re.compile(r'\{\{\s*[Hh]eader')

# Page generators
# (presumably these yield the pages whose titles start with the given
# prefix -- confirm against the pagegenerators module)

xo_pages = pagegenerators.PrefixingPageGenerator(u'Executive Order')
pp_pages = pagegenerators.PrefixingPageGenerator(u'Proclamation')


# Procedure to check extra stuff before header

def check_stuff_before_header(page):
    """Report whatever text precedes the header template on *page*.

    Nothing is modified; the findings are written with wikipedia.output():

    * redirect pages are reported and skipped without fetching their text;
    * pages in which ``header_xp`` finds no match are reported as having
      no header;
    * otherwise everything in front of the first match is printed (this
      is an empty string when the header opens the page).

    page -- page object providing title(), isRedirectPage() and get().
    Always returns None.
    """
    wikipedia.output(u'(III) Checking [[%s]]' % page.title())

    # Redirects are filtered out before the page text is requested.
    if page.isRedirectPage():
        wikipedia.output(u' (XXX) This page is a redirect')
        return

    text = page.get()
    match = header_xp.search(text)

    # Identity test: re search() returns None when nothing matched.
    if match is None:
        wikipedia.output(u' (XXX) This page does not have a header')
        return

    # Everything up to (not including) the first "{{header" occurrence.
    wikipedia.output(u' (XXX) Text before header:\n %s' % text[:match.start()])


# check pages

for page in xo_pages:
    check_stuff_before_header(page)

for page in pp_pages:
    check_stuff_before_header(page)