User:TalBot/xo pp fix.py


#! /usr/bin/env python
# _*_ coding: utf8 _*_
#
# Fix extra stuff before Executive Orders and Presidential Proclamations
#
# run with args "-log -putthrottle:xx"
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2

import pagegenerators, re, wikipedia

# Configure the framework's "get" throttle with a delay of 5
# (presumably seconds between page fetches -- confirm against the
# wikipedia module's throttle implementation).
wikipedia.get_throttle.setDelay(5)


# Handle args

# handleArgs has to be *called*: iterating over the bare function object
# raises TypeError.  The call is expected to return the command-line
# arguments the framework did not consume itself; this script defines no
# options of its own, so every leftover argument is reported and ignored.
args = wikipedia.handleArgs()

for arg in args:
    wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)


# Basic text tokens

# Edit summary used when a cleaned page is saved.
# NOTE(review): the text ends with "before " and a trailing space, which
# looks truncated (possibly a lost "{{header}}") -- confirm the intended
# wording before changing it; it is left untouched here.
summ = u'Removing garbage before '


# Regexes

# Matches the opening of a header template call: two opening braces,
# optional whitespace, then "header" or "Header".  Only the start of the
# template is matched, not its parameters or closing braces.
header_xp = re.compile(r'\{\{\s*[Hh]eader')


# page generators

# One generator per title prefix (presumably each yields the pages whose
# titles start with that prefix -- see PrefixingPageGenerator).  The two
# assignments must be separate statements; fused on one line they are a
# syntax error.
xo_pages = pagegenerators.PrefixingPageGenerator(u'Executive Order')
pp_pages = pagegenerators.PrefixingPageGenerator(u'Proclamation')


# Procedure to fix extra stuff before header

def fix_stuff_before_header(page):
    """Strip everything that precedes the header template on *page*.

    The page is left alone when it is a redirect, when its text contains
    no match for ``header_xp``, or when the match already starts at the
    very beginning of the text.  Otherwise the text from the start of the
    match onwards is saved back with the module-level summary ``summ`` as
    a non-minor edit.

    *page* is expected to be a pywikipedia page object providing
    ``title()``, ``isRedirectPage()``, ``get()`` and ``put()``.
    Exceptions raised by those calls are not handled here.
    """
    # title, isRedirectPage, get and match.start are methods: they have
    # to be called.  Testing the bare bound method is always true (every
    # page would be skipped as a redirect), and slicing with the bare
    # match.start raises TypeError.
    wikipedia.output(u'(III) Checking %s' % page.title())
    if page.isRedirectPage():
        wikipedia.output(u'  (III) Skipping page, redirect')
        return
    text = page.get()
    match = header_xp.search(text)
    if match is None:
        wikipedia.output(u'  (III) Skipping page, no header')
        return
    newtext = text[match.start():]
    # Only write when something was actually cut off, to avoid null edits.
    if newtext != text:
        wikipedia.output(u'  (III) Removing garbage before header')
        page.put(newtext, summ, minorEdit=False)


# check pages

# Run the fix over every page from the 'Executive Order' generator first,
# then over every page from the 'Proclamation' generator.
for page_source in (xo_pages, pp_pages):
    for page in page_source:
        fix_stuff_before_header(page)