// User:Kaldari/PageCleanUp.js

/*jshint boss:true*/ /*global $, mw*/

/** * This script adds a toolbar button for cleaning up the OCR text. */

( function ( mw, $ ) {

/**
 * Clean up the raw OCR text of a page: normalise whitespace and dashes,
 * re-flow paragraphs, straighten quotes and repair common OCR misreadings.
 *
 * NOTE(review): this function was restored from a copy in which HTML-like
 * literals had been stripped. The `<br />`, `</ref>`, `<poem>`, `&ndash;`,
 * `<center>` and two-em-dash literals below were reconstructed from the
 * comments that accompany each step; confirm against the upstream script.
 *
 * @param {string} text Wikitext of the page being edited.
 * @return {string} The cleaned-up wikitext.
 */
function cleanUp( text ) {
	text = text
		// Start by trimming leading and trailing whitespace.
		.trim()
		// Remove trailing spaces at the end of each line.
		.replace( / +\n/g, '\n' )
		// Remove trailing whitespace preceding a hard line break.
		.replace( / +<br *\/?>/g, '<br />' )
		// Remove trailing whitespace and numerals at the end of page text
		// (numerals are nearly always page numbers in the footer).
		.replace( /[\s\d]+$/g, '' )
		// Remove trailing spaces at the end of refs.
		.replace( / +<\/ref>/g, '</ref>' )
		// Remove trailing spaces at the end of template calls.
		.replace( / +}}/g, '}}' )
		// Convert double-hyphen to mdash (avoiding breaking HTML comment syntax).
		.replace( /([^\!])--([^>])/g, '$1—$2' )
		// Replace double-em-dash with a two-em dash (U+2E3A).
		.replace( /——/g, '\u2E3A' )
		// Remove whitespace (including line breaks) around dashes.
		.replace( /\s+—\s+/g, '—' ) // Em dash
		.replace( /\s+–\s+/g, '–' ) // En dash
		// Remove spacing around mdash, but only if it has spaces on both sides
		// (we don't want to remove the trailing space from "...as follows:— ",
		// bearing in mind that the space will already be gone if at end of line).
		.replace( / +— +/g, '—' )
		// Join words that are hyphenated across a line break
		// (but leave "|-" table syntax alone).
		.replace( /([^\|])-\n/g, '$1' );

	// Re-flow paragraphs only on pages without <poem>, where line breaks
	// are significant and must be left untouched.
	if ( text.indexOf( '<poem>' ) === -1 ) {
		text = text
			// Lines that start with " should probably be new lines,
			// if the previous line ends in punctuation,
			// other than a comma or semicolon;
			// and let's get rid of trailing space while we're at it.
			.replace( /([^\n\w,;])\n\" */g, '$1\n\n"' )
			// Lines that end with " should probably precede a new line,
			// unless preceded by a comma,
			// or unless the new line starts with a lower-case letter;
			// and let's get rid of preceding space while we're at it.
			.replace( /([^,])\ *\"\n([^a-z\n])/g, '$1"\n\n$2' )
			// Remove single line breaks; preserve multiple.
			// But not if there's a tag, template or table syntax on either
			// side of the line break.
			.replace( /([^>}\|\n])\n([^:#\*<{\|\n])/g, '$1 $2' )
			// Collapse sequences of spaces into a single space.
			.replace( / +/g, ' ' );
	}

	// More page cleanup.
	text = text
		// Dump spurious hard breaks at the end of paragraphs.
		.replace( /<br *\/?>\n\n/g, '\n\n' )
		// Remove unwanted spaces before punctuation marks.
		.replace( / ([;:\?!,])/g, '$1' )
		// Unicodify.
		.replace( /&mdash;/g, '—' )
		.replace( /&ndash;/g, '–' )
		.replace( /&quot;/g, '"' )
		// Straighten quotes and apostrophes.
		.replace( /[“”]/g, '"' )
		.replace( /[‘’`]/g, '\'' )
		// OCR fixes.
		// Convert i9 to 19, etc.
		.replace( /[il]([0-9])/g, '1$1' )
		// "the", "them", "their", etcetera.
		.replace( /tlie/g, 'the' )
		// "U" -> "ll" when preceded by a lowercase letter.
		.replace( /([a-z])U/g, '$1ll' )
		// "would", "could", "should".
		.replace( /woidd/g, 'would' )
		.replace( /coidd/g, 'could' )
		.replace( /shoidd/g, 'should' )
		// Many works have apostrophes missing from OCR.
		.replace( /([a-z]) s\b/g, '$1\'s' ) // it's, he's, etc.
		.replace( /n t\b/g, 'n\'t' ) // can't, isn't, didn't, etc.
		.replace( /([a-zI]) ll\b/g, '$1\'ll' ) // I'll, we'll, etc.
		.replace( /\bI m\b/g, 'I\'m' ) // I'm
		.replace( /\b([Yy])ou re\b/g, '$1ou\'re' ) // you're
		.replace( /\b([Ww])e re\b/g, '$1e\'re' ) // we're
		.replace( /\b([Tt])hey re\b/g, '$1hey\'re' ) // they're
		.replace( /([a-zI]) ve\b/g, '$1\'ve' ) // I've, we've, etc.
		// NOTE(review): an "expand diacritical templates" step belongs here,
		// but its pattern and replacement were both empty in the copy this
		// was restored from. It is omitted (an empty pattern replaced by an
		// empty string is a no-op) until the original regex is recovered.
		// Replace "float center" with "block center"; the original template
		// name was misleading enough to warrant routinely fixing.
		.replace( /\{\{float center/g, '{{block center' )
		// Center tags are converted to the {{center}} template.
		// [\s\S] matches any character including newlines; the former
		// [.\n] only matched literal dots and newlines.
		.replace( /<center>\s*([\s\S]*?)\s*<\/center>/g, '{{center|$1}}' )
		// Full stop followed by a lower case letter should probably be a comma.
		.replace( /\.(\s[a-z])/g, ',$1' )
		// Remove unwanted ligatures (global, so every occurrence is fixed).
		.replace( /ﬁ/g, 'fi' )
		.replace( /ﬂ/g, 'fl' );

	return text;
}

/**
 * Register a "Page clean-up" button in the "format" group of the "main"
 * section of the WikiEditor toolbar attached to #wpTextbox1.
 *
 * When the button is activated, the current contents of the edit box are
 * passed through cleanUp() and the result is written back to the edit box.
 */
function addButton() {
	$( '#wpTextbox1' ).wikiEditor( 'addToToolbar', {
		section: 'main',
		group: 'format',
		tools: {
			smile: {
				label: 'Page clean-up',
				type: 'button',
				icon: '//upload.wikimedia.org/wikipedia/commons/thumb/1/15/Text-x-generic-apply.svg/22px-Text-x-generic-apply.svg.png',
				action: {
					type: 'callback',
					execute: function () {
						var $textbox = $( '#wpTextbox1' );
						// .val() must be *called* to read the text; passing the
						// method itself would hand cleanUp() a function.
						$textbox.val( cleanUp( $textbox.val() ) );
					}
				}
			}
		}
	} );
}

/**
 * Entry point: when the user is editing (action "edit" or "submit") a page
 * in the "Page" namespace and has the WikiEditor toolbar preference enabled,
 * load the required modules and then add the clean-up button.
 */
function main() {
	// Check if we're editing a Page page and that the required modules are
	// available. Then, customize the toolbar.
	var isPage = mw.config.get( 'wgCanonicalNamespace' ) === 'Page',
		isEditing = $.inArray( mw.config.get( 'wgAction' ), [ 'edit', 'submit' ] ) !== -1;

	if ( !isPage || !isEditing ) {
		return;
	}

	mw.loader.using( 'user.options', function () {
		// This can be the string "0" if the user disabled the preference,
		// hence the deliberately loose comparison.
		if ( mw.user.options.get( 'usebetatoolbar' ) == 1 ) {
			var dependencies = [ 'ext.wikiEditor', 'ext.proofreadpage.page.edit' ];
			if ( mw.user.options.get( 'codemirror-syntax-highlight' ) == 1 ) {
				// CodeMirror doesn't like to be loaded later.
				dependencies.push( 'ext.CodeMirror.lib' );
			}
			// Wait for both the modules and DOM-ready. $.ready must be
			// awaited via $.when, not passed as the loader's ready callback:
			// invoking $.ready() would force jQuery's DOM-ready state.
			$.when( mw.loader.using( dependencies ), $.ready ).then( addButton );
		}
	} );
}

main; }( mediaWiki, jQuery ) );