User:TalBot/xo pp fix.py
Jump to navigation
Jump to search
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Fix extra stuff before Executive Orders and Presidential Proclamations
#
# run with args "-log -putthrottle:xx"
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#

import pagegenerators
import re
import wikipedia

wikipedia.get_throttle.setDelay(5)

# Handle args: handleArgs() returns whatever it did not consume itself;
# this script defines no options of its own, so the rest is only reported.
args = wikipedia.handleArgs()

for arg in args:
    wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)

# Basic text tokens

# Edit summary used for every page this script saves.
summ = u'Removing garbage before {{header}}'

# Regexes

# Matches the opening of a {{header ...}} / {{Header ...}} template call,
# allowing whitespace between the braces and the template name.
header_xp = re.compile(r'\{\{\s*[Hh]eader')

# page generators

xo_pages = pagegenerators.PrefixingPageGenerator(u'Executive Order')
pp_pages = pagegenerators.PrefixingPageGenerator(u'Proclamation')


# Procedure to fix extra stuff before header

def fix_stuff_before_header(page):
    """Strip everything that precedes the first {{header}} call on *page*.

    Redirect pages and pages without a header template are skipped with a
    log message.  The page is saved (as a non-minor edit, with ``summ`` as
    the edit summary) only when text is actually removed.

    page -- the page object to check, as yielded by the page generators
    """
    wikipedia.output(u'(III) Checking [[%s]]' % page.title())
    if page.isRedirectPage():
        wikipedia.output(u' (III) Skipping page, redirect')
        return
    text = page.get()
    match = header_xp.search(text)
    if match is None:
        wikipedia.output(u' (III) Skipping page, no header')
        return
    # Keep the text from the first header template onwards.
    newtext = text[match.start():]
    if newtext != text:
        wikipedia.output(u' (III) Removing garbage before header')
        page.put(newtext, summ, minorEdit = False)


# check pages

try:
    for page in xo_pages:
        fix_stuff_before_header(page)

    for page in pp_pages:
        fix_stuff_before_header(page)
finally:
    # Release this process's throttle slot even if the run aborts.
    wikipedia.stopme()