/*
User:Ostrea/proofreading.js

This page defines a TemplateScript library. It is not meant to be referenced directly; see TemplateScript for usage.
*/
/* global $, pathoschild */

/** * TemplateScript adds configurable templates and scripts to the sidebar, and adds an example regex editor. * @see https://meta.wikimedia.org/wiki/TemplateScript * @update-token */ // $.ajax('//tools-static.wmflabs.org/meta/scripts/pathoschild.templatescript.js', { dataType:'script', cache:true }).then(function {	/*********	** Define library	*********/	pathoschild.TemplateScript.library.define({ key: 'wikisource.proofreading', name: 'Proofreading tools', url: '//en.wikisource.org/wiki/Wikisource:TemplateScript#Proofreading', description: 'A set of scripts for proofreading works in the Page: namespace. This includes tools for cleaning up OCR, generating page templates, and adding common text formatting.', categories: [ {				name: 'Page tools', scripts: [ { key: 'add-header', name: 'Add header', script: function(editor) { addPageHeader(editor); }, forNamespaces: 'page' }, { key: 'add-footer', name: 'Add footer', script: function(editor) { addPageFooter(editor); }, forNamespaces: 'page' }, { key: 'cleanup-ocr', name: 'Clean up OCR', script: function(editor) { pageCleanup(editor); }, forNamespaces: 'page' }, { key: 'make-ref', name: 'Make reference', script: function(editor) { makeReference(editor); }, forNamespaces: 'page' }, { key: 'smallcaps', name: 'Convert to small-caps', script: function(editor) { smallcaps(editor); }, forNamespaces: 'page' }, { key: 'uppercase', name: 'Convert to uppercase', script: function(editor) { upper(editor); }, forNamespaces: 'page' } ]			}		]	});

/*********
** Page context
*********/
// Shared, lazily-populated context for the page tools; filled in by _initialise.
var state = {
	initialised: false, // whether the page context has been initialised
	page: {
		number: null, // the djvu page number extracted from the URL
		proofed: null
	},
	specialFormats: [] // work-specific header template formats
};

/*********
** Private methods
*********/
/**
 * Initialise the data needed by the page tools.
 * Only the first call does any work; later calls return immediately.
 */
var _initialise = function() {
	// only initialise once
	if(state.initialised)
		return;
	state.initialised = true;

	// get page metadata
	// (the page number is only found when editing a ".djvu/<n>" page)
	var pn = /\.djvu\/([0-9]+)&action=edit/g.exec(location.href);
	var pq = document.getElementById('pagequality');
	state.page = {
		number: pn !== null ? parseInt(pn[1], 10) : null,
		// truthy when the #pagequality element carries a quality0 or quality2-4 class
		proofed: pq && pq.getAttribute('class') && pq.getAttribute('class').match(/quality0|quality[2-4]/)
	};

	// get user-defined work formats
	// expected format:
	//  {
	//     title: /History of England /,
	//     evenHeader: '...',
	//     oddHeader: '...',
	//     footer: '',
	//     footerWithReferences: ''
	//  }
	state.specialFormats = [];
	if(window.specialFormats)
		state.specialFormats = state.specialFormats.concat(window.specialFormats);
};

/**
 * Convert the text to title case based on English rules.
 * Currently disabled: the implementation below is commented out.
 * NOTE(review): the empty call parentheses (toLowerCase, toUpperCase) appear to be
 * missing in the disabled code; restore them before re-enabling it.
 * @param {string} text The text to convert.
 */
/*
var _titlecase = function(text) {
	// split text into individual words and examine them one by one
	var words = text.toLowerCase.split(" ");
	$.each(function(i, word) {
		switch(word) {
			case "a":
			case "an":
			case "and":
			case "as":
			case "at":
			case "but":
			case "by":
			case "etcetera":
			case "etc.":
			case "for":
			case "from":
			case "in":
			case "nor":
			case "of":
			case "o'":
			case "on":
			case "or":
			case "the":
			case "to":
			case "with":
			case "versus":
			case "vs.":
			case "v.":
			case "yet":
				break; // don't capitalise articles, "to" as part of an infinitive, prepositions or short conjunctions
			default: // capitalise everything else
				words[i] = word.substring(0, 1).toUpperCase + word.substring(1, words[i].length);
				break;
		}
	});

	// capitalise first word regardless
	words[0] = words[0].substring(0, 1).toUpperCase + words[0].substring(1, words[0].length);

	// capitalise last word regardless
	var last = words.length-1;
	words[last] = words[last].substring(0, 1).toUpperCase + words[last].substring(1, words[last].length);

	// reconstruct title
	return words.join(' ');
};
*/

/*********
** Script methods
*********/
/**
 * Add a running header to the page's header box, then drop the first line of
 * unproofed page text (presumed to be the raw OCR of that header).
 * Does nothing when no page number could be extracted from the URL.
 * @param {object} editor The script helpers for the page.
 */
var addPageHeader = function(editor) {
	_initialise();
	if(state.page.number === null)
		return;

	var isEven = (state.page.number % 2 === 0);
	var generic = true;
	var headertext = '';

	// use the first work-specific format whose title pattern matches this page
	for (var f = 0; f < state.specialFormats.length; f++) {
		var format = state.specialFormats[f];
		if (mw.config.get('wgTitle').match(format.title)) {
			headertext = isEven ? format.evenHeader : format.oddHeader;
			generic = false;
			break;
		}
	}

	// no special header matched, use a generic running header
	// NOTE(review): both generic header strings are empty in this copy of the script;
	// they may have held header markup that was lost - confirm before relying on them.
	if (generic) {
		if (isEven)
			headertext = ''; // assume verso, with page number at left
		else
			headertext = '';
	}
	$('#wpHeaderTextbox').val(function(i, val) {
		return $.trim(val + '\n' + headertext);
	});

	// if this is unproofed text, then delete the first line of the OCR text,
	// which presumably is raw OCR of the header we've just inserted
	if (!state.page.proofed) {
		var text = editor.get();
		editor.set(text.slice(text.indexOf('\n') + 1));
	}
};

/**
 * Clean up OCR errors in the text, and push content at the top
 * & bottom of the page into the header & footer boxes respectively.
 * @param {object} editor The script helpers for the page.
 */
var pageCleanup = function(editor) {
	_initialise();

	// push content at the top & bottom into the header & footer
	// ('<noinclude>' is 11 characters and '</noinclude>' is 12, which is what the offsets below skip over)
	var text;
	if (editor.get().match(/^<noinclude>/)) {
		text = editor.get();
		var e = text.indexOf("</noinclude>");
		$('#wpHeaderTextbox').val(function(i, val) {
			return $.trim(val + "\n" + text.substr(11, e-11).replace(/^\s+|\s+$/g, ''));
		});
		editor.set(text.substr(e+12));
	}
	if (editor.get().match(/<\/noinclude\>$/)) {
		text = editor.get();
		var s = text.lastIndexOf("<noinclude>");
		$('#wpFooterTextbox').val(function(i, val) {
			return $.trim(text.substr(s+11, text.length-s-11-12).replace(/^\s+|\s+$/g, '') + "\n" + val);
		});
		editor.set(text.substr(0, s));
	}

	// clean up text
	editor
		// addition
		.replace(/66\n/g, '\n"')
		// remove trailing spaces at the end of each line
		.replace(/ +\n/g, '\n')

		// remove trailing whitespace preceding a hard line break
		// NOTE(review): no line-break tag is present in this pattern; as written it only collapses runs of spaces
		.replace(/ +/g, ' ')

		// remove trailing whitespace and numerals at the end of page text
		// (numerals are nearly always page numbers in the footer)
		.replace(/[\s\d]+$/g, '')

		// remove trailing spaces at the end of refs (the closing tag itself is kept)
		.replace(/ +<\/ref>/g, '</ref>')
		// remove trailing spaces at the end of template calls
		.replace(/ +}}/g, '}}')
		// additions
		// replace --- with a long dash
		// NOTE(review): the replacement string is empty in this copy, so "---" is simply removed
		.replace(/---/g, '')
		// replace the tags with small, centred poem formatting
		// NOTE(review): the pattern below appears to have lost the tags it targeted; as written it would
		// delete every space on the page, so it is disabled until the original pattern is recovered.
		//	.replace(/ /g, '')
		// remove a single or double space at the start of a line
		.replace(/\n {2}/g, '\n')
		.replace(/\n {1}/g, '\n')
		// same at the start of the page
		.replace(/^ {2}/g, '')
		.replace(/^ {1}/g, '')
		// replace @ with start-of-title formatting
		// NOTE(review): only blank lines remain in the replacement; any title markup may have been lost
		.replace(/@/g, '\n\n\n\n\n')
		// remove the running header
		.replace(/FAR FROM THE MADDING CROWD.\n/g, '')
		.replace(/FAR FROM THE MADDING CROWD/g, '')
		// remove the watermark
		.replace(/Univ Calif - Digitized by Microsoft®/g, '')
		.replace(/Univ Calif - Digitized by Microsoft ®/g, '')
		.replace(/Univ Calif - Digitized by Microsoft/g, '')
		.replace(/Ⓡ/g, '')
		// remove numbers
		.replace(/[0-9]/g, '')
		// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
		.replace(/([^\!])--([^>])/g, '$1—$2')
		// remove spacing around mdash, but only if it has spaces on both sides
		// (we don't want to remove the trailing space from "...as follows:— ",
		// bearing in mind that the space will already be gone if at end of line).
		.replace(/ +— +/g, '—')
		// join words that are hyphenated across a line break, and weird OCR hyphens (¬)
		// (but leave "|-" table syntax alone)
		.replace(/([^\|])[-¬]\n/g, '$1')
	;

	// clean up pages if they don't have
	// NOTE(review): the marker this check looked for was lost; as written the paragraph
	// clean-up below only runs when the page contains no space at all.
	if (!editor.contains(' ')) {
		editor
			// lines that start with " should probably be new lines,
			// if the previous line ends in punctuation,
			// other than a comma or semicolon
			// and let's get rid of trailing space while we're at it
			.replace(/([^\n\w,;])\n\" */g, '$1\n\n"')
			// lines that end with " should probably precede a new line,
			// unless preceded by a comma,
			// or unless the new line starts with a lower-case letter;
			// and let's get rid of preceding space while we're at it
			.replace(/([^,])\ *\"\n([^a-z\n])/g, '$1"\n\n$2')
			// remove single line breaks; preserve multiple.
			// but not if there's a tag, template or table syntax either side of the line break
			.replace(/([^>}\|\n])\n([^:#\*<{\|\n])/g, '$1 $2')
			// collapse sequences of spaces into a single space
			.replace(/ +/g, ' ')
		;
	}

	// more page cleanup
	editor
		// dump spurious hard breaks at the end of paragraphs
		// NOTE(review): no break tag remains in this pattern, so the call is currently a no-op
		.replace(/\n\n/g, '\n\n')

		// remove unwanted spaces around punctuation marks
		.replace(/ ([;:\?!,])/g, '$1')
		// unicodify
		.replace(/&mdash;/g, '—')
		// NOTE(review): pattern and replacement are identical here (probably a decoded entity), so this is a no-op
		.replace(/–/g, '–')
		.replace(/&quot;/g, '"')
		// straighten quotes and apostrophes.
		.replace(/[“”]/g, '"')
		.replace(/[‘’`]/g, '\'')
		// OCR fixes
		// convert i9 to 19, etc.
		.replace(/[il]([0-9])/g, '1$1')
		// "the", "them", "their", etcetera
		.replace(/tlie/g, 'the')
		// "U" -> "ll" when preceded by a lowercase letter.
		.replace(/([a-z])U/g, '$1ll')
		// "would", "could"
		.replace(/woidd/g, 'would')
		.replace(/coidd/g, 'could')
		.replace(/shoidd/g, 'should')
		// many works have apostrophes missing from OCR
		.replace(/([a-z]) s\b/g, '$1\'s') // it's he's etc
		.replace(/n t\b/g, 'n\'t') // can't isn't didn't etc
		.replace(/([a-zI]) ll\b/g, '$1\'ll') // I'll we'll etc
		.replace(/\bI m\b/g, 'I\'m') // I'm
		.replace(/\b([Yy])ou re\b/g, '$1ou\'re') // you're
		.replace(/\b([Ww])e re\b/g, '$1e\'re') // we're
		.replace(/\b([Tt])hey re\b/g, '$1hey\'re') // they're
		.replace(/([a-zI]) ve\b/g, '$1\'ve') // I've we've etc
		// expand diacritical templates
		// NOTE(review): the pattern and replacement were lost ("//g" is not a valid regex literal),
		// so this call is disabled until the original is recovered.
		//	.replace(//g, '')
		// replace "float center" with "block center"; original template name was misleading enough to warrant routinely fixing
		.replace(/\{\{float center/g, '{{block center')
		// NOTE(review): the opening tag seems to be missing from this pattern - confirm against the original
		.replace(/ \s*([.\n]*?)\s*<\/center>/g, '{{center|$1}}')
		// additions
		// replace "' '" with the proper template
		//	.replace(/"'/g, '{{" \'}}')
		//	.replace(/'"/g, '{{\' "}}')
		// ,"" -> converter
		// Everything in the block comment below is disabled: the quote/bracket converter
		// and the long-s ("f" for "s") converter.
		/*
		.replace(/," /g, ', (') .replace(/, "/g, ') ') .replace(/; "/g, ';) ') .replace(/!" s/g, '! (s')
		.replace(/ -/g, '—') .replace(/!" c/g, '! (c') .replace(/Mrs /g, 'Mrs. ') .replace(/Mr /g, 'Mr. ')
		.replace(/St /g, 'St. ') .replace(/ 1 /g, ' I ') .replace(/\(\(/g, '(') .replace(/,\(/g, ', (')
		.replace(/\n" /g, '\n"') .replace(/: " /g, ': "')
		//	.replace(/66 /g, '"')
		.replace(/\. \./g, '\.\.')
		// long-s / f converter
		.replace(/fp/g, 'sp') .replace(/fs/g, 'ss') .replace(/ffu/g, 'ssu') .replace(/fex/g, 'sex')
		.replace(/ffion/g, 'ssion') .replace(/feldom/g, 'seldom') .replace(/fh/g, 'sh') .replace(/fign/g, 'sign')
		.replace(/fuch/g, 'such') .replace(/foon/g, 'soon') .replace(/addreff/g, 'address') .replace(/ufy/g, 'usy')
		.replace(/ fy/g, ' sy') .replace(/effi/g, 'essi') .replace(/ ft/g, ' st') .replace(/occafio/g, 'occasio')
		.replace(/ fent/g, ' sent') .replace(/fup/g, 'sup') .replace(/ufi/g, 'usi') .replace(/leaft/g, 'least')
		.replace(/fong/g, 'song') .replace(/ufu/g, 'usu') .replace(/faw/g, 'saw') .replace(/effed/g, 'essed')
		.replace(/fome /g, 'some ') .replace(/laft/g, 'last') .replace(/referv/g, 'reserv') .replace(/kiff/g, 'kiss')
		.replace(/feem/g, 'seem') .replace(/ fo /g, ' so ') .replace(/filen/g, 'silen') .replace(/fob/g, 'sob')
		.replace(/hafte/g, 'haste') .replace(/ fide/g, ' side') .replace(/feeing/g, 'seeing') .replace(/feem/g, 'seem')
		.replace(/feen/g, 'seen') .replace(/to rife/g, 'to rise') .replace(/ceaf/g, 'ceas') .replace(/eferv/g, 'eserv')
		.replace(/fecr/g, 'secr') .replace(/fc/g, 'sc') .replace(/ifun/g, 'isun') .replace(/fta/g, 'sta')
		.replace(/fk/g, 'sk') .replace(/fity/g, 'sity') .replace(/fta/g, 'sta') .replace(/fub/g, 'sub')
		.replace(/maft/g, 'mast') .replace(/hefe/g, 'hese') .replace(/fw/g, 'sw') .replace(/ofom/g, 'osom')
		.replace(/ſ/g, 's') .replace(/furp/g, 'surp') .replace(/ifed/g, 'ised') .replace(/fay/g, 'say')
		.replace(/felf/g, 'self') .replace(/pofi/g, 'posi') .replace(/uft/g, 'ust') .replace(/faid/g, 'said')
		.replace(/fearc/g, 'searc') .replace(/fto/g, 'sto')
		.replace(/fing([^e])/g, '$1sing$2') //will give trouble with "singed" and "singer"
		.replace(/efti/g, 'esti') .replace(/rft/g, 'rst') .replace(/moft/g, 'most') .replace(/dift/g, 'dist')
		.replace(/caft/g, 'cast') .replace(/nft/g, 'nst') .replace(/ufe/g, 'use') .replace(/fome([^n])/g, 'some$1')
		.replace(/hofe/g, 'hose') .replace(/faft/g, 'fast')
		.replace(/([^dlih])eft/g, '$1est') //will give trouble with "bereft"
		.replace(/lfo/g, 'lso') .replace(/horfe/g, 'horse') .replace(/ fet /g, ' set ') .replace(/paff/g, 'pass')
		.replace(/lofe/g, 'lose') .replace(/poff/g, 'poss') .replace(/fb/g, 'sb') .replace(/eafi/g, 'easi')
		.replace(/myf/g, 'mys') .replace(/fenfe/g, 'sense') .replace(/ftr/g, 'str') .replace(/taft/g, 'tast')
		.replace(/mif/g, 'mis') .replace(/ rof/g, ' ros') .replace(/fm/g, 'sm') .replace(/fible/g, 'sible')
		.replace(/fince/g, 'since') .replace(/fevera/g, 'severa') .replace(/([^hl])eft /g, '$1est ')
		.replace(/,\n/g, '.\n') //comma placed before a new line is turned into period
		.replace(/fervant/g, 'servant')
		*/
		//tesserakt wrangler
		.replace(/ſ/g, 's')
		.replace(/([a-z])\. ([a-z])/g, '$1 $2')
		.replace(/cd /g, 'ed ') //space to not trigger "anecdote"
		.replace(/cb/g, 'eb')
		.replace(/\* /g, '"')
		.replace(/\*/g, '')
		.replace(/« |«/g, '"')
		.replace(/© /g, '"')
		.replace(/1/g, 'I')
		.replace(/""/g, '"')
		.replace(/< /g, '')
		.replace(/= /g, '')
		.replace(/ ir /g, ' it ')
		.replace(/ Ir /g, ' It ')
		.replace(/\]|\[/g, 'I')
		.replace(/ \. /g, ' ')
		.replace(/ - /g, '—')
		.replace(/— /g, '—')
		.replace(/©/g, '')
		.replace(/\?"\?/g, '?"')
		//	.replace(/([a-z])\'([a-z])/g, '$1 $2')
		.replace(/ -([a-z])/g, ' $1')
		.replace(/([a-z])- /g, '$1 ')
		.replace(/ ' /g, ' ')
		.replace(/ \.([a-z])/g, ' $1')
		.replace(/,,/g, ',')
		.replace(/([a-z])\.([a-z])/g, '$1 $2')
		//	.replace(/([a-z]) s /g, '$1\'s ')
		//	.replace(/([a-z]) s,/g, '$1\'s,')
		//	.replace(/([a-z]) s./g, '$1\'s.')
		.replace(/o clock/g, 'o\'clock')
		.replace(/([a-z]) ll /g, '$1\'ll ')
		.replace(/([a-z]) t /g, '$1\'t ')
		.replace(/([a-z]) re /g, '$1\'re ')
		.replace(/([a-z]) m /g, '$1\'m ')
		//replaces " being replaced by ' "
		.replace(/'\n\n"/g, '"')
	;
};

/**
 * As you work your way through the page, when you encounter a reference, just mark it with tags and continue.
 * Once you've got to the end of the page and proofed the references, simply highlight each reference in turn,
 * and use this function to move it to its proper position.
 * @param {object} editor The script helpers for the page.
 */
var makeReference = function(editor) {
	_initialise();
	var editbox = $('#wpTextbox1').get(0);
	editbox.focus();
	var refStart = editbox.selectionStart;
	var refEnd = editbox.selectionEnd;

	// NOTE(review): the marker literal was reduced to a single space in this copy; an empty
	// <ref></ref> pair is assumed here because the code inserts the selection 5 characters
	// (the length of '<ref>') after the match - confirm against the original script.
	var firstref = editbox.value.indexOf('<ref></ref>');
	if (firstref !== -1) {
		// move the selected text to just after the opening tag of the first marker
		editbox.value = editbox.value.slice(0,firstref+5)
			+ editbox.value.slice(refStart, refEnd)
			+ editbox.value.slice(firstref+5, refStart)
			+ editbox.value.slice(refEnd);
	}
};

/**
 * Insert formatted references into the footer box if needed.
 * A work-specific format (matched on the page title) supplies the footer when one exists;
 * otherwise the footer is cleared for pages without references.
 * @param {object} editor The script helpers for the page.
 */
var addPageFooter = function(editor) {
	_initialise();
	var editbox = $('#wpTextbox1').get(0);
	var footerbox = $('#wpFooterTextbox').get(0);

	// NOTE(review): the first probe was reduced to a single space in this copy; "<ref" is
	// assumed from the "page contains no refs" intent and the sibling "{{#tag:ref" probe.
	var hasRefs = editbox.value.indexOf("<ref") !== -1 || editbox.value.indexOf("{{#tag:ref") !== -1;

	// find the first work-specific format whose title pattern matches this page
	// (title patterns are regular expressions, so match them the same way addPageHeader does)
	var title = mw.config.get('wgTitle');
	var format = null;
	for (var f = 0; f < state.specialFormats.length; f++) {
		if (title.match(state.specialFormats[f].title)) {
			format = state.specialFormats[f];
			break;
		}
	}

	if (!hasRefs) {
		// page contains no refs: use the special footer, or just strip out the references tag
		footerbox.value = format ? format.footer : '';
	}
	else if (format) {
		footerbox.value = format.footerWithReferences;
	}
	// no special footer matched, so use a generic ref tag
	// NOTE(review): doGeneric is not defined in this script; it is treated as an optional
	// global switch so that a missing definition no longer throws a ReferenceError.
	// The generic footer also looks like it lost its references tag - confirm.
	else if (typeof doGeneric !== 'undefined' && doGeneric) {
		footerbox.value = '{{block center|}}';
	}
};

/**
 * Mark the selected text as small-caps. If the text is all uppercase, it is
 * converted to lowercase first (the title-case conversion is currently disabled).
 * @param {object} editor The script helpers for the page.
 */
var smallcaps = function(editor) {
	_initialise();
	editor.replaceSelection(function(text) {
		// Applying small-caps to all-caps text is pointless...
		// ... unless the all-caps is OCR of text that is actually small-caps.
		// Check if text is all-caps, and if it is, lowercase it before applying small-caps.
		if (text === text.toUpperCase())
			text = text.toLowerCase();
		//	text = _titlecase(text);

		// NOTE(review): this copy of the script returned an empty string, which deleted the
		// selection; the {{sc|...}} wrapper is assumed to be the lost markup - confirm.
		return '{{sc|' + text + '}}';
	});
};

/**	 * Convert the text to uppercase. * @param {object} editor The script helpers for the page. */	var upper = function(editor) { _initialise; editor.replaceSelection(function(text) {			return text.toUpperCase;		}); }; }); //