// User:Beleg Tâl/proofreading.js

/*

This page defines a TemplateScript library. It's not meant to be referenced directly. See TemplateScript for usage.

I've made some changes to it, in particular:
 * Uses HTML5 line breaks (<br>) instead of XHTML line breaks (<br />), rather than removing them
 * Has an alternative "clean up OCR" method for poetry, which hard-codes line breaks

*/
/* global $, pathoschild */

/**
 * TemplateScript adds configurable templates and scripts to the sidebar, and adds an example regex editor.
 * @see https://meta.wikimedia.org/wiki/TemplateScript
 * @update-token
 */
// NOTE(review): the bare "//" markers found here and after the final "});" look like
// the remains of "<nowiki>" guards whose tags were lost; confirm against the wiki page.
// <nowiki>
$.ajax('//tools-static.wmflabs.org/meta/scripts/pathoschild.templatescript.js', { dataType:'script', cache:true }).then(function() {
	/*********
	** Define library
	*********/
	pathoschild.TemplateScript.library.define({
		key: 'wikisource.proofreading',
		name: 'Proofreading tools',
		url: '//en.wikisource.org/wiki/Wikisource:TemplateScript#Proofreading',
		description: 'A set of scripts for proofreading works in the Page: namespace. This includes tools for cleaning up OCR, generating page templates, and adding common text formatting.',
		categories: [
			{
				name: 'Page tools',
				scripts: [
					{ key: 'add-header', name: 'Add header', script: function(editor) { addPageHeader(editor); }, forNamespaces: 'page' },
					{ key: 'add-footer', name: 'Add footer', script: function(editor) { addPageFooter(editor); }, forNamespaces: 'page' },
					{ key: 'cleanup-ocr', name: 'Clean up OCR', script: function(editor) { pageCleanup(editor); }, forNamespaces: 'page' },
					{ key: 'cleanup-ocr2', name: 'Clean up OCR poem', script: function(editor) { poetryCleanup(editor); }, forNamespaces: 'page' },
					{ key: 'make-ref', name: 'Make reference', script: function(editor) { makeReference(editor); }, forNamespaces: 'page' },
					{ key: 'smallcaps', name: 'Convert to small-caps', script: function(editor) { smallcaps(editor); }, forNamespaces: 'page' },
					{ key: 'uppercase', name: 'Convert to uppercase', script: function(editor) { upper(editor); }, forNamespaces: 'page' }
				]
			}
		]
	});

	/*********
	** Page context
	*********/
	var state = {
		initialised: false, // whether the page context has been initialised
		page: {
			number: null,  // the djvu page number extracted from the URL
			proofed: null
		},
		specialFormats: [] // work-specific header template formats
	};

	// Words that stay lower-case in a title: articles, "to" as part of an
	// infinitive, prepositions and short conjunctions.
	var SMALL_WORDS = [
		'a', 'an', 'and', 'as', 'at', 'but', 'by', 'etcetera', 'etc.', 'for',
		'from', 'in', 'nor', 'of', 'o\'', 'on', 'or', 'the', 'to', 'with',
		'versus', 'vs.', 'v.', 'yet'
	];

	// The offsets 11 and 12 hard-coded in the original noinclude handling are
	// exactly the lengths of these two tags.
	var NOINCLUDE_OPEN = '<noinclude>';
	var NOINCLUDE_CLOSE = '</noinclude>';

	/*********
	** Private methods
	*********/
	/**
	 * Initialise the data needed by the page tools.
	 */
	var _initialise = function() {
		// only initialise once
		if (state.initialised)
			return;
		state.initialised = true;

		// get page metadata
		var pn = /\.djvu\/([0-9]+)&action=edit/g.exec(location.href);
		var pq = document.getElementById('pagequality');
		state.page = {
			number: pn !== null ? parseInt(pn[1], 10) : null,
			proofed: pq && pq.getAttribute('class') && pq.getAttribute('class').match(/quality0|quality[2-4]/)
		};

		// get user-defined work formats
		// expected format:
		//  {
		//     title: /History of England /,
		//     evenHeader: '...',
		//     oddHeader: '...',
		//     footer: '',
		//     footerWithReferences: ''
		//  }
		state.specialFormats = [];
		if (window.specialFormats)
			state.specialFormats = state.specialFormats.concat(window.specialFormats);
	};

	/**
	 * Find the first user-defined work format whose title pattern matches the current page title.
	 * @returns {object|null} The matching format, or null if none matches.
	 */
	var _findSpecialFormat = function() {
		var title = mw.config.get('wgTitle');
		for (var i = 0; i < state.specialFormats.length; i++) {
			if (title.match(state.specialFormats[i].title))
				return state.specialFormats[i];
		}
		return null;
	};

	/**
	 * Upper-case the first character of a word and leave the rest untouched.
	 * @param {string} word The word to capitalise.
	 * @returns {string} The capitalised word.
	 */
	var _capitalise = function(word) {
		return word.substring(0, 1).toUpperCase() + word.substring(1);
	};

	/**
	 * Convert the text to title case based on English rules.
	 * @param {string} text The text to convert.
	 * @returns {string} The text in title case.
	 */
	var _titlecase = function(text) {
		// split text into individual words and examine them one by one
		var words = text.toLowerCase().split(' ');
		$.each(words, function(i, word) {
			if (SMALL_WORDS.indexOf(word) === -1)
				words[i] = _capitalise(word);
		});

		// capitalise first and last word regardless
		var last = words.length - 1;
		words[0] = _capitalise(words[0]);
		words[last] = _capitalise(words[last]);

		// reconstruct title
		return words.join(' ');
	};

	/**
	 * Move a leading and/or trailing noinclude section of the page text into
	 * the header and footer boxes respectively, removing it from the editor.
	 * @param {object} editor The script helpers for the page.
	 */
	var _moveNoincludeSections = function(editor) {
		var text = editor.get();
		if (text.indexOf(NOINCLUDE_OPEN) === 0) {
			var headEnd = text.indexOf(NOINCLUDE_CLOSE);
			if (headEnd !== -1) {
				var headContent = $.trim(text.slice(NOINCLUDE_OPEN.length, headEnd));
				$('#wpHeaderTextbox').val(function(i, val) {
					return $.trim(val + '\n' + headContent);
				});
				editor.set(text.slice(headEnd + NOINCLUDE_CLOSE.length));
			}
		}

		// re-read: the header section may just have been removed
		text = editor.get();
		if (text.match(/<\/noinclude>$/)) {
			var footStart = text.lastIndexOf(NOINCLUDE_OPEN);
			if (footStart !== -1) {
				var footContent = $.trim(text.slice(footStart + NOINCLUDE_OPEN.length, text.length - NOINCLUDE_CLOSE.length));
				$('#wpFooterTextbox').val(function(i, val) {
					return $.trim(footContent + '\n' + val);
				});
				editor.set(text.slice(0, footStart));
			}
		}
	};

	/**
	 * Apply the punctuation, entity and OCR fixes shared by both clean-up scripts.
	 * @param {object} editor The script helpers for the page.
	 * @returns {object} The editor, for chaining.
	 */
	var _fixCommonOcrErrors = function(editor) {
		return editor
			// dump spurious hard breaks at the end of paragraphs
			// NOTE(review): the break tag was missing from this pattern; restored from the comment.
			.replace(/<br *\/?>\n\n/g, '\n\n')

			// remove unwanted spaces around punctuation marks
			.replace(/ ([;:\?!,])/g, '$1')

			// unicodify
			.replace(/&mdash;/g, '—')
			.replace(/&ndash;/g, '–')
			.replace(/&quot;/g, '"')

			// straighten quotes and apostrophes.
			.replace(/[“”]/g, '"')
			.replace(/[‘’`]/g, '\'')

			// OCR fixes
			// convert i9 to 19, etc.
			.replace(/[il]([0-9])/g, '1$1')
			// "the", "them", "their", etcetera
			.replace(/tlie/g, 'the')
			// "U" -> "ll" when preceded by a lowercase letter.
			.replace(/([a-z])U/g, '$1ll')
			// "would", "could"
			.replace(/woidd/g, 'would')
			.replace(/coidd/g, 'could')
			.replace(/shoidd/g, 'should')

			// many works have apostrophes missing from OCR
			.replace(/([a-z]) s\b/g, '$1\'s') // it's he's etc
			.replace(/n t\b/g, 'n\'t') // can't isn't didn't etc
			.replace(/([a-zI]) ll\b/g, '$1\'ll') // I'll we'll etc
			.replace(/\bI m\b/g, 'I\'m') // I'm
			.replace(/\b([Yy])ou re\b/g, '$1ou\'re') // you're
			.replace(/\b([Ww])e re\b/g, '$1e\'re') // we're
			.replace(/\b([Tt])hey re\b/g, '$1hey\'re') // they're
			.replace(/([a-zI]) ve\b/g, '$1\'ve'); // I've we've etc

		// NOTE(review): an "expand diacritical templates" replacement followed here,
		// but its pattern and replacement are unrecoverable (only ".replace(//g, '')"
		// remains). Restore it from the on-wiki source.
	};

	/*********
	** Script methods
	*********/
	/**
	 * Add a running-header template to the header box, using the work-specific
	 * format if one matches the page title and a generic one otherwise.
	 * @param {object} editor The script helpers for the page.
	 */
	var addPageHeader = function(editor) {
		_initialise();
		if (state.page.number === null)
			return;

		var isEven = (state.page.number % 2 === 0);
		var format = _findSpecialFormat();
		var headertext;
		if (format) {
			headertext = isEven ? format.evenHeader : format.oddHeader;
		}
		else {
			// no special header matched, use a generic running header
			// NOTE(review): both literals were empty strings. {{rh}} with the page
			// number is reconstructed from the surviving "page number at left" comment; confirm.
			if (isEven)
				headertext = '{{rh|' + state.page.number + '||}}'; // assume verso, with page number at left
			else
				headertext = '{{rh|||' + state.page.number + '}}'; // assume recto, with page number at right
		}
		$('#wpHeaderTextbox').val(function(i, val) {
			return $.trim(val + '\n' + headertext);
		});

		// if this is unproofed text, then delete the first line of the OCR text,
		// which presumably is raw OCR of the header we've just inserted
		if (!state.page.proofed) {
			var text = editor.get();
			editor.set(text.slice(text.indexOf('\n') + 1));
		}
	};

	/**
	 * Clean up OCR errors in poetry, hard-coding a line break at the end of every
	 * line, and push noinclude content at the top & bottom of the page into the
	 * header & footer boxes respectively.
	 * @param {object} editor The script helpers for the page.
	 */
	var poetryCleanup = function(editor) {
		_initialise();
		_moveNoincludeSections(editor);

		// clean up text
		// NOTE(review): the <br> literals in the next two calls are reconstructed
		// from the comments and the file header note about HTML5 line breaks.
		editor
			// put a hard line break at the end of every line
			.replace(/ *\n/g, '<br>\n')

			// replace xhtml hard line break with html hard line break
			.replace(/<br *\/>/g, '<br>')

			// remove trailing whitespace and numerals at the end of page text
			// (numerals are nearly always page numbers in the footer)
			.replace(/[\s\d]+$/g, '')

			// remove trailing spaces at the end of refs
			.replace(/ +<\/ref>/g, '</ref>')

			// remove trailing spaces at the end of template calls
			.replace(/ +}}/g, '}}')

			// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
			.replace(/([^\!])--([^>])/g, '$1—$2')

			// remove spacing around mdash, but only if it has spaces on both sides
			// (we don't want to remove the trailing space from "...as follows:— ",
			// bearing in mind that the space will already be gone if at end of line).
			.replace(/ +— +/g, '—');

		// clean up pages if they don't have <poem>
		// NOTE(review): the tag name was missing from this check; confirm it is <poem>.
		if (!editor.contains('<poem>')) {
			editor
				// lines that start with " let's get rid of trailing space
				.replace(/\n" */g, '\n"')
				// lines that end with " let's get rid of preceding space
				.replace(/ *("\n)/g, '$1')
				// collapse sequences of spaces into a single space
				.replace(/ +/g, ' ');
		}

		// more page cleanup
		_fixCommonOcrErrors(editor)
			// replace "float center" and "block center" with "center block"
			.replace(/\{\{(float|block) center/g, '{{center block')
			.replace(/<center>\s*([.\n]*?)\s*<\/center>/g, '{{center|$1}}');
	};

	/**
	 * Clean up OCR errors in the text, and push noinclude content at the top
	 * & bottom of the page into the header & footer boxes respectively.
	 * @param {object} editor The script helpers for the page.
	 */
	var pageCleanup = function(editor) {
		_initialise();
		_moveNoincludeSections(editor);

		// clean up text
		editor
			// remove trailing spaces at the end of each line
			.replace(/ +\n/g, '\n')

			// remove trailing whitespace preceding a hard line break
			// NOTE(review): the <br> tag in pattern and replacement is reconstructed; confirm.
			.replace(/ +<br *\/?>/g, '<br>')

			// remove trailing whitespace and numerals at the end of page text
			// (numerals are nearly always page numbers in the footer)
			.replace(/[\s\d]+$/g, '')

			// remove trailing spaces at the end of refs
			.replace(/ +<\/ref>/g, '</ref>')

			// remove trailing spaces at the end of template calls
			.replace(/ +}}/g, '}}')

			// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
			.replace(/([^\!])--([^>])/g, '$1—$2')

			// remove spacing around mdash, but only if it has spaces on both sides
			// (we don't want to remove the trailing space from "...as follows:— ",
			// bearing in mind that the space will already be gone if at end of line).
			.replace(/ +— +/g, '—')

			// join words that are hyphenated across a line break
			// (but leave "|-" table syntax alone)
			.replace(/([^\|])-\n/g, '$1');

		// clean up pages if they don't have <poem>
		// NOTE(review): the tag name was missing from this check; confirm it is <poem>.
		if (!editor.contains('<poem>')) {
			editor
				// lines that start with " should probably be new lines,
				// if the previous line ends in punctuation,
				// other than a comma or semicolon
				// and let's get rid of trailing space while we're at it
				.replace(/([^\n\w,;])\n\" */g, '$1\n\n"')

				// lines that end with " should probably precede a new line,
				// unless preceded by a comma,
				// or unless the new line starts with a lower-case letter;
				// and let's get rid of preceding space while we're at it
				.replace(/([^,])\ *\"\n([^a-z\n])/g, '$1"\n\n$2')

				// remove single line breaks; preserve multiple.
				// but not if there's a tag, template or table syntax either side of the line break
				.replace(/([^>}\|\n])\n([^:#\*<{\|\n])/g, '$1 $2')

				// collapse sequences of spaces into a single space
				.replace(/ +/g, ' ');
		}

		// more page cleanup
		_fixCommonOcrErrors(editor)
			// replace "float center" with "block center"; the original template
			// name was misleading enough to warrant routinely fixing
			.replace(/\{\{float center/g, '{{block center')
			.replace(/<center>\s*([.\n]*?)\s*<\/center>/g, '{{center|$1}}');
	};

	/**
	 * As you work your way through the page, when you encounter a reference, just mark it
	 * with <ref></ref> tags and continue. Once you've got to the end of the page and proofed
	 * the references, simply highlight each reference in turn, and use this function to move
	 * it to its proper position (inside the first empty <ref></ref> pair).
	 * @param {object} editor The script helpers for the page.
	 */
	var makeReference = function(editor) {
		_initialise();
		var editbox = $('#wpTextbox1').get(0);
		editbox.focus();
		var refStart = editbox.selectionStart;
		var refEnd = editbox.selectionEnd;

		var value = editbox.value;
		var firstref = value.indexOf('<ref></ref>');
		if (firstref === -1)
			return;

		// insertion point sits just after the opening "<ref>" (5 characters)
		var insertAt = firstref + 5;
		// a selection that starts before the placeholder cannot be moved forward
		// into it; rearranging in that case would duplicate text
		if (refStart < insertAt)
			return;

		editbox.value = value.slice(0, insertAt)
			+ value.slice(refStart, refEnd)
			+ value.slice(insertAt, refStart)
			+ value.slice(refEnd);
	};

	/**
	 * Insert formatted references into the footer box if needed.
	 * @param {object} editor The script helpers for the page.
	 */
	var addPageFooter = function(editor) {
		_initialise();
		var editbox = $('#wpTextbox1').get(0);
		var footerbox = $('#wpFooterTextbox').get(0);
		var hasRefs = editbox.value.indexOf('<ref') !== -1 || editbox.value.indexOf('{{#tag:ref') !== -1;

		// a work-specific format takes precedence; titles are matched the same
		// way as in addPageHeader
		var format = _findSpecialFormat();
		if (format) {
			footerbox.value = hasRefs ? format.footerWithReferences : format.footer;
			return;
		}

		if (!hasRefs) {
			// no special footer matched, so just strip out the references tag
			footerbox.value = '';
		}
		// `doGeneric` is not declared anywhere in this script; guard the lookup so a
		// missing flag means "no generic footer" instead of a ReferenceError.
		else if (typeof doGeneric !== 'undefined' && doGeneric) {
			// no special footer matched, so use a generic ref tag
			// NOTE(review): the template argument was empty; <references /> is assumed. Confirm.
			footerbox.value = '{{block center|<references />}}';
		}
	};

	/**
	 * Mark the selected text with {{sc}}. If the text is uppercase, it will be converted to titlecase.
	 * @param {object} editor The script helpers for the page.
	 */
	var smallcaps = function(editor) {
		_initialise();
		editor.replaceSelection(function(text) {
			// Applying small-caps to all-caps text is pointless...
			// ... unless the all-caps is OCR of text that is actually small-caps.
			// Check if text is all-caps, and if it is, convert it to title case before applying small-caps.
			if (text === text.toUpperCase())
				text = _titlecase(text);
			// NOTE(review): the literal here was an empty string, which would delete the
			// selection; {{sc|…}} is restored from the doc comment. Confirm.
			return '{{sc|' + text + '}}';
		});
	};

	/**
	 * Convert the text to uppercase.
	 * @param {object} editor The script helpers for the page.
	 */
	var upper = function(editor) {
		_initialise();
		editor.replaceSelection(function(text) {
			return text.toUpperCase();
		});
	};
});
// </nowiki>