# User:TalBot/xo pp check.py


#! /usr/bin/env python
# _*_ coding: utf8 _*_
# Fix extra stuff before Executive Orders and Presidential Proclamations
# run with args "-log -putthrottle:xx"
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
# Licence: GPLv2

import pagegenerators, re, wikipedia

# Set the framework's read-throttle delay to 5 (presumably seconds --
# confirm against the pywikipedia throttle implementation).
wikipedia.get_throttle.setDelay(5)


# Handle args

# handleArgs must be *called*: the bare function object is not iterable, so
# looping over it would raise a TypeError before any page is checked.
args = wikipedia.handleArgs()

# This script defines no options of its own, so anything handleArgs hands
# back is reported and otherwise ignored.
for arg in args:
    wikipedia.output(u'(WWW) Ignoring unrecognised argument: %s' % arg)


 * 1) Basic text tokens

summ = u'Removing garbage before header'


 * 1) Regexes

header_xp = re.compile(r'\{\{\s*[Hh]eader')


 * 1) page generators

xo_pages = pagegenerators.PrefixingPageGenerator(u'Executive Order') pp_pages = pagegenerators.PrefixingPageGenerator(u'Proclamation')


 * 1) Procedure to check extra stuff before header

def check_stuff_before_header(page): wikipedia.output(u'(III) Checking %s' % page.title) if(page.isRedirectPage): wikipedia.output(u'  (XXX) This page is a redirect') return text = page.get match = header_xp.search(text) if(match == None): wikipedia.output(u'  (XXX) This page does not have a header') return wikipedia.output(u'  (XXX) Text before header:\n   %s' % text[:match.start]) return


 * 1) check pages

for page in xo_pages: check_stuff_before_header(page)

for page in pp_pages: check_stuff_before_header(page)