// User:Samwilson/PageCleanUp.js

/*jshint boss:true*/ /*global $, mw*/

/**
 * This script adds a "Page clean-up" button to the WikiEditor toolbar.
 * Pressing it runs the OCR clean-up rules over the text in the edit box.
 */

( function ( mw, $ ) {

/**
 * Tidy raw OCR page text for proofreading.
 *
 * The text is passed through an ordered pipeline of regular-expression
 * substitutions: whitespace trimming, dash normalisation, re-joining of
 * hard-wrapped lines, entity/quote normalisation, and a set of fixes for
 * common OCR misreadings. The order of the steps matters, because later
 * rules rely on the whitespace normalisation done by earlier ones.
 *
 * NOTE(review): several literals in this function had their HTML/template
 * markup stripped from the copy this was restored from. Restored values are
 * marked individually below; please confirm them against the live script.
 *
 * @param {string} text Raw page text.
 * @return {string} The cleaned-up text.
 */
function cleanUp( text ) {
	text = text
		// Start by trimming leading and trailing whitespace.
		.trim()

		// Remove trailing spaces at the end of each line.
		.replace( / +\n/g, '\n' )

		// Remove trailing whitespace preceding a hard line break.
		// NOTE(review): the <br> markup was restored from the comment's intent.
		.replace( / +<br *\/?>/g, '<br />' )

		// Remove trailing whitespace and numerals at the end of page text
		// (numerals are nearly always page numbers in the footer).
		.replace( /[\s\d]+$/g, '' )

		// Remove trailing spaces at the end of refs (the closing tag is kept).
		.replace( / +<\/ref>/g, '</ref>' )

		// Remove trailing spaces at the end of template calls.
		.replace( / +}}/g, '}}' )

		// Convert double-hyphen to em dash. The guards on either side avoid
		// breaking HTML comment syntax ("<!--" and "-->").
		.replace( /([^!])--([^>])/g, '$1—$2' )

		// Replace double-em-dash with a two-em bar.
		// NOTE(review): the replacement template was lost; {{bar|2}} is
		// presumed to be the intended two-em bar template — confirm.
		.replace( /——/g, '{{bar|2}}' )

		// Remove whitespace around dashes, but only when there is whitespace
		// on both sides (we don't want to remove the trailing space from
		// "...as follows:— ").
		.replace( /\s+—\s+/g, '—' ) // Em dash
		.replace( /\s+–\s+/g, '–' ) // En dash
		.replace( / +— +/g, '—' )

		// Join words that are hyphenated across a line break
		// (but leave "|-" table syntax alone).
		.replace( /([^|])-\n/g, '$1' );

	// Re-flow paragraphs, unless the page contains verse whose line breaks
	// are significant.
	// NOTE(review): the guard tag was lost; <poem> is presumed — confirm.
	if ( text.indexOf( '<poem>' ) === -1 ) {
		text = text
			// Lines that start with " should probably be new paragraphs if
			// the previous line ends in punctuation other than a comma or
			// semicolon. Also drops any space following the quote mark.
			.replace( /([^\n\w,;])\n" */g, '$1\n\n"' )

			// Lines that end with " should probably precede a new paragraph,
			// unless preceded by a comma, or unless the next line starts with
			// a lower-case letter. Also drops any space before the quote mark.
			.replace( /([^,]) *"\n([^a-z\n])/g, '$1"\n\n$2' )

			// Remove single line breaks; preserve multiple. Skipped when
			// there is tag, template or table syntax either side of the break.
			.replace( /([^>}|\n])\n([^:#*<{|\n])/g, '$1 $2' )

			// Collapse sequences of spaces into a single space.
			.replace( / +/g, ' ' );
	}

	// More page cleanup.
	text = text
		// Dump spurious hard breaks at the end of paragraphs.
		// NOTE(review): the <br> markup was restored from the comment's intent.
		.replace( /<br *\/?>\n\n/g, '\n\n' )

		// Remove unwanted spaces before punctuation marks.
		.replace( / ([;:?!,])/g, '$1' )

		// Unicodify HTML entities.
		.replace( /&mdash;/g, '—' )
		.replace( /&ndash;/g, '–' )
		.replace( /&quot;/g, '"' )

		// Straighten quotes and apostrophes.
		.replace( /[“”]/g, '"' )
		.replace( /[‘’`]/g, '\'' )

		// OCR fixes.
		// Convert i9 to 19, etc.
		.replace( /[il]([0-9])/g, '1$1' )
		// "the", "them", "their", etcetera.
		.replace( /tlie/g, 'the' )
		// "U" -> "ll" when preceded by a lowercase letter.
		.replace( /([a-z])U/g, '$1ll' )
		// "would", "could", "should".
		.replace( /woidd/g, 'would' )
		.replace( /coidd/g, 'could' )
		.replace( /shoidd/g, 'should' )

		// Many works have apostrophes missing from the OCR.
		.replace( /([a-z]) s\b/g, '$1\'s' ) // it's, he's, etc.
		.replace( /n t\b/g, 'n\'t' ) // can't, isn't, didn't, etc.
		.replace( /([a-zI]) ll\b/g, '$1\'ll' ) // I'll, we'll, etc.
		.replace( /\bI m\b/g, 'I\'m' ) // I'm
		.replace( /\b([Yy])ou re\b/g, '$1ou\'re' ) // you're
		.replace( /\b([Ww])e re\b/g, '$1e\'re' ) // we're
		.replace( /\b([Tt])hey re\b/g, '$1hey\'re' ) // they're
		.replace( /([a-zI]) ve\b/g, '$1\'ve' ) // I've, we've, etc.

		// NOTE(review): an "expand diacritical templates" substitution used
		// to sit here, but both its pattern and its replacement were lost.
		// It is omitted rather than guessed at — restore from the live script.

		// Replace "float center" with "block center"; the original template
		// name was misleading enough to warrant routinely fixing.
		.replace( /\{\{float center/g, '{{block center' )

		// Center tags are converted to the {{center}} template. [\s\S] is
		// used so that the content may contain any character, including
		// newlines (a "." inside a character class only matches a literal dot).
		.replace( /<center>\s*([\s\S]*?)\s*<\/center>/g, '{{center|$1}}' )

		// A full stop followed by a lower case letter should probably be a comma.
		.replace( /\.(\s[a-z])/g, ',$1' )

		// Remove unwanted ligatures (every occurrence, not only the first).
		.replace( /ﬁ/g, 'fi' )
		.replace( /ﬂ/g, 'fl' );

	return text;
}

// Register the clean-up button once the WikiEditor toolbar is ready.
// The hook handler receives the jQuery-wrapped edit box as $textarea.
mw.hook( 'wikiEditor.toolbarReady' ).add( function ( $textarea ) {
	$textarea.wikiEditor( 'addToToolbar', {
		section: 'main',
		group: 'format',
		tools: {
			'Samwilson-PageCleanUp': {
				label: 'Page clean-up',
				type: 'button',
				icon: 'https://upload.wikimedia.org/wikipedia/commons/thumb/1/15/Text-x-generic-apply.svg/22px-Text-x-generic-apply.svg.png',
				action: {
					type: 'callback',
					// Read the current text, clean it, and write it back.
					execute: function () {
						$textarea.val( cleanUp( $textarea.val() ) );
					}
				}
			}
		}
	} );
} );

}( mediaWiki, jQuery ) );