// User:Inductiveload/cleanup.js

/*
 * OCR cleanup script
 *
 * Mostly a bunch of regexes and prayer
 */

/* eslint-disable camelcase, no-restricted-syntax */

( function ( $, mw ) {	'use strict';

const version = '0.1';
const signature = 'wsCleanup';

// Log severities, in increasing order of importance (compared numerically by log()).
const DEBUG = 0;
const INFO = 1;
const ERROR = 2;

/**
 * Script-wide configuration object.
 *
 * Only some of these keys are read by the code visible near this definition;
 * the rest are presumably consumed elsewhere in the script — verify before
 * removing or renaming any of them.
 */
const Cleanup = {
	// minimum severity that log() will print
	logLevel: ERROR,
	enable: true,
	testFunctions: [],
	enableTesting: mw.config.get( 'wgTitle' ).endsWith( 'cleanup-test' ),
	portletCategory: 'page',
	activeNamespaces: [ 'page' ],
	actionTitle: 'WsCleanup',
	additionalOcrReplacements: [],
	// [ regex, replacement? ] entries; matching replacements are skipped by
	// PartialWordRegexProcessor
	disabledReplacements: [],
	cleanupFunctions: [],
	italicWords: [],
	doLongSReplacements: false,
	doTemplateCleanup: true,
	remove_running_header: true,
	replaceSmartQuotes: true,
	collapseSuspiciousParagraphs: true,
	shortLineThreshold: 45,
	// consulted by pageMayHaveLangs() for notLangs/onlyLangs replacement options
	possibleLanguages: [ 'en' ], // 'fr', 'es', 'de', 'zh-pinyin' ],
	italiciseForeign: true,
	smallAbbreviations: [],
	runningHeaderPatterns: [
		/^([ivxlcIVLXC.,]+|[iI0-9.,]+)\s+([A-Z[\]\s^*\-–—.,]*)\s*$/,
		/^([A-Z\s[\]^*\-–—.,]*)\s+([ivxlcIVLXC.,]+|[iI0-9.,]+)\s*$/,
		/^\s*(\d+|[A-Z[\] ]+)\s*$/
	],
	smallAbbrTemplate: 'smaller',
	editSummary: '/* Proofread */',
	markProofread: true,
	cleanupAccesskey: 'c'
};

/**
 * Print a message to the browser console when its severity is at least
 * Cleanup.logLevel.
 *
 * @param {number} level One of DEBUG, INFO or ERROR
 * @param {*} s The message (or value) to print after the 'Cleanup: ' prefix
 */
function log( level, s ) {
	if ( level < Cleanup.logLevel ) {
		return;
	}

	// ERROR and above go to console.error, everything else to console.log
	// eslint-disable-next-line no-console
	const log_fn = level >= ERROR ? console.error : console.log;
	log_fn( 'Cleanup: ', s );
}

/**
 * Abstract base class for text processors.
 *
 * Subclasses must override both process() and name(); the base versions
 * throw so that a missing override is noticed immediately.
 */
class CleanupProcessor {
	constructor() {}

	/**
	 * Transform the text.
	 *
	 * @return {string} The processed text (in subclasses)
	 * @throws {Error} Always, in the base class
	 */
	process( /* text */ ) {
		throw new Error( 'Processors must implement process' );
	}

	/**
	 * @return {string} A human-readable processor name (in subclasses)
	 * @throws {Error} Always, in the base class
	 */
	name() {
		throw new Error( 'Processors must implement name' );
	}
}

/**
 * Run one processor over the whole editor text and write the result back.
 *
 * @param {Object} editor Object exposing get() (returns the text) and set( text )
 * @param {CleanupProcessor} processor Processor exposing name() and process( text )
 */
function process_editor( editor, processor ) {
	const original = editor.get();
	// name() must be called: interpolating the method itself would log its source
	log( INFO, `Processing editor with ${processor.name()}` );
	editor.set( processor.process( original ) );
}

/**
 * Replace whole words: each pattern is wrapped in \b...\b and applied globally.
 */
class WholeWordRegexProcessor extends CleanupProcessor {
	/**
	 * @param {Array} reps List of [ bad, good ] pairs, where `bad` is a regex
	 *  source string and `good` is the replacement
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	/**
	 * @param {string} text
	 * @return {string} Text with every whole-word match replaced
	 */
	process( text ) {
		log( DEBUG, `Making ${this.reps.length} replacements` );

		for ( const [ bad, good ] of this.reps ) {
			const re = new RegExp( '\\b' + bad + '\\b', 'g' );
			text = text.replace( re, good );
		}
		return text;
	}

	name() {
		return 'Generic whole word regexes';
	}
}

/**
 * Check whether any of the given language codes is among the languages the
 * page may contain (Cleanup.possibleLanguages).
 *
 * @param {string[]} deniedLangs Language codes to look for
 * @return {boolean} true if at least one code is a possible page language
 */
function pageMayHaveLangs( deniedLangs ) {
	return Cleanup.possibleLanguages.some( ( lang ) => deniedLangs.includes( lang ) );
}

/**
 * Apply a list of regex replacements anywhere in the text (not only on whole
 * words). Every pattern is applied globally, whatever flags it was written with.
 */
class PartialWordRegexProcessor extends CleanupProcessor {
	/**
	 * @param {Array} reps List of [ regex, replacement, options? ] entries.
	 *  `options.notLangs` skips the entry when the page may contain one of the
	 *  listed languages; `options.onlyLangs` skips it unless the page may
	 *  contain one of them.
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	/**
	 * @param {string} text
	 * @return {string} Text after all enabled replacements
	 * @throws Rethrows any error raised while applying a replacement, after
	 *  logging which entry failed
	 */
	process( text ) {
		log( DEBUG, `Making ${this.reps.length} replacements` );

		this.reps.forEach( ( v, i ) => {
			const [ pattern, replacement, options ] = v;

			// A disabled entry with no replacement disables every rule with that
			// pattern; one with a replacement disables only the exact pair.
			const disabled = Cleanup.disabledReplacements.some( ( dv ) =>
				dv[ 0 ].source === pattern.source &&
					( !dv[ 1 ] || dv[ 1 ] === replacement )
			);

			if ( disabled ) {
				log( DEBUG, `Skipped disabled replacement: ${pattern.source} -> ${replacement}` );
				return;
			}

			if ( options && options.notLangs && pageMayHaveLangs( options.notLangs ) ) {
				log( DEBUG, `Skipped replacement with denied language: ${pattern.source} (due to ${options.notLangs})` );
				return;
			}

			if ( options && options.onlyLangs && !pageMayHaveLangs( options.onlyLangs ) ) {
				log( DEBUG, `Skipped replacement as no allowed language: ${pattern.source} (due to ${options.onlyLangs})` );
				return;
			}

			try {
				// force the global flag, keeping any others (i, m, ...)
				const newflags = 'g' + pattern.flags.replace( 'g', '' );

				// \b doesn't match useful things like unicode; a fix-up that
				// replaced a leading \b with a lookbehind was left disabled:
				// .replace( /^\\b/, '(?<=^|[\\s\\-;:\'",.!?–—{}\\[]\\|])' );
				const newSource = pattern.source;

				text = text.replace( new RegExp( newSource, newflags ), replacement );
			} catch ( error ) {
				log( ERROR, `Error in ${i}th replacement: ${v}` );
				throw error;
			}
		} );

		return text;
	}

	name() {
		return 'Generic partial word regexes';
	}
}

/**
 * Make replacements for things that cannot be a suffix in a word, but instead
 * must be a new word (i.e. a space has gone missing _before_ the match)
 */
class BannedSuffixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} suffix_list Patterns that may not end a longer word
	 */
	constructor( suffix_list ) {
		super();
		this.suffix_list = suffix_list;
	}

	/**
	 * @param {string} text
	 * @return {string} Text with a space inserted before each banned suffix
	 *  that directly follows word characters
	 */
	process( text ) {
		for ( const v of this.suffix_list ) {
			// force the global flag, keeping any others
			const newflags = 'g' + v.flags.replace( 'g', '' );
			const regex = new RegExp( '(\\w+)(' + v.source + ')', newflags );

			text = text.replace( regex, '$1 $2' );
		}
		return text;
	}

	name() {
		return 'Banned suffixes';
	}
}

/**
 * Make replacements for things that cannot be a prefix in a word, but instead
 * must be a previous word (i.e. a space has gone missing _after_ the match)
 */
class BannedPrefixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} prefix_list Patterns that may not start a longer word
	 */
	constructor( prefix_list ) {
		super();
		this.prefix_list = prefix_list;
	}

	/**
	 * @param {string} text
	 * @return {string} Text with a space inserted after each banned prefix
	 *  that is directly followed by word characters
	 */
	process( text ) {
		for ( const v of this.prefix_list ) {
			// force the global flag, keeping any others
			const newflags = 'g' + v.flags.replace( 'g', '' );
			const regex = new RegExp( '(' + v.source + ')(\\w+)', newflags );

			text = text.replace( regex, '$1 $2' );
		}
		return text;
	}

	name() {
		return 'Banned prefixes';
	}
}

/**
 * Make replacements for words that cannot stand alone, but would most likely be
 * suffixes of previous words (i.e. a space has been inserted _before_ the match)
 */
class OrphanSuffixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} reps Patterns for the orphaned suffixes
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	/**
	 * @param {string} text
	 * @return {string} Text where the single whitespace character or hyphen
	 *  before each orphaned suffix has been removed
	 */
	process( text ) {
		for ( const v of this.reps ) {
			// force the global flag, keeping any others
			const newflags = 'g' + v.flags.replace( 'g', '' );
			const regex = new RegExp( '[\\s\\-](' + v.source + '\\b)', newflags );

			text = text.replace( regex, '$1' );
		}
		return text;
	}

	name() {
		return 'Orphan suffixes';
	}
}

/**
 * Make replacements for words that cannot stand alone, but would most likely be
 * prefixes of following words (i.e. a space has been inserted _after_ the match)
 */
class OrphanPrefixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} reps Patterns for the orphaned prefixes
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	/**
	 * Matching is always global and case-insensitive.
	 *
	 * @param {string} text
	 * @return {string} Text where the single whitespace character or hyphen
	 *  after each orphaned prefix has been removed
	 */
	process( text ) {
		for ( const v of this.reps ) {
			// Strip *every* existing g and i before prepending 'gi': removing
			// only the first one turned /x/gi into the invalid flags 'gii'.
			const newflags = 'gi' + v.flags.replace( /[gi]/g, '' );
			const regex = new RegExp( '(\\b' + v.source + ')[\\s\\-]', newflags );

			text = text.replace( regex, '$1' );
		}
		return text;
	}

	name() {
		return 'Orphan prefixes';
	}
}

/**
 * Wrap selected matches in italics
 */
class ItaliciseProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} reps Patterns whose matches should be italicised
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	/**
	 * @param {string} text
	 * @return {string} Text with each match wrapped in wikitext italic markup
	 *  (''...''), except matches already directly preceded by ''
	 */
	process( text ) {
		for ( const v of this.reps ) {
			// NOTE(review): this removes only the first g-or-i flag, so an 'i'
			// flag survives on /x/gi but is dropped on /x/i — confirm intent.
			const newflags = 'g' + v.flags.replace( /[gi]/, '' );
			const regex = new RegExp( '(?<!\'\')(' + v.source + ')', newflags );

			// The replacement must add the italic quotes; a bare '$1' would
			// leave the text unchanged.
			text = text.replace( regex, "''$1''" );
		}
		return text;
	}

	name() {
		return 'Italics';
	}
}

/*
 * These functions need the original line breaks
 */

/**
 * Clean up line endings and re-join words hyphenated across a line break.
 *
 * @param {Object} editor Editor passed through to process_editor()
 */
const do_pre_collapse_cleanup = function ( editor ) {

	const reps = [

		// remove trailing spaces at the end of each line
		[ / +\n/, '\n' ],

		// treat these symbols as hyphens
		[ /[⌐¬]/, '-' ],

		// join words that are hyphenated across a line break
		// (but leave "|-" table syntax alone)

		// Capitals keep their hyphen e.g. non-European
		[ /([^|])-\n(?=[ÁÀA-ZÉÈÖ])/, '$1-' ],
		// everything else loses the hyphen
		[ /([^|])-\n(?=[\w])/, '$1' ]
	];

	process_editor( editor, new PartialWordRegexProcessor( reps ) );
};

/**
 * Strip leading lines that look like a running header (page number, title...).
 */
class RunningHeaderProcessor extends CleanupProcessor {

	/**
	 * @param {RegExp[]} rh_patterns Patterns tested against each leading line;
	 *  a line matching any of them is treated as part of the running header
	 */
	constructor( rh_patterns ) {
		super();
		this.rh_patterns = rh_patterns;
	}

	name() {
		return 'Trim running header patterns';
	}

	/**
	 * Drop lines from the top of the text for as long as they are blank or
	 * match a header pattern; everything from the first other line on is kept.
	 *
	 * @param {string} text
	 * @return {string} The remaining lines joined with '\n' (so CRLF line
	 *  endings come back as LF)
	 */
	process( text ) {
		const lines = text.split( /\r?\n/ );

		let new_start_line = 0;

		for ( const line of lines ) {
			const is_blank = line.trim().length === 0;
			const is_header = !is_blank &&
				this.rh_patterns.some( ( pattern ) => pattern.test( line ) );

			if ( !is_blank && !is_header ) {
				break;
			}
			new_start_line += 1;
		}

		return lines.slice( new_start_line ).join( '\n' );
	}
}

/**
 * Generic punctuation, whitespace and typography cleanup.
 *
 * The rules are applied in order by a PartialWordRegexProcessor, which makes
 * every pattern global.
 *
 * @param {Object} editor Editor passed through to process_editor()
 */
const do_generic_cleanup = function ( editor ) {

	// various cleanup
	const reps = [
		// Digitized by Google (kill)
		[ /\s?D[ijl]g[ijl]t[ijl][sz][eco]d\s+by[^\n]*\s+([6G][Oo0Q]{2}g[lIf][eco])?/, '' ],
		[ /\bG[oO0]{2}gle\b/, '' ],

		// Remove highly suspicious chars
		[ /[■•]/, '' ],

		// remove trailing whitespace preceding a hard line break
		// (the pattern must include the <br> tag: a bare / +/ would rewrite
		// every run of spaces in the page)
		[ / +<br *\/?>/, '<br />' ],

		// remove trailing whitespace at the end of page text
		[ /\s+$/, '' ],

		// remove trailing spaces at the end of refs
		// (the closing tag has to be put back, not replaced by a space)
		[ / +<\/ref>/, '</ref>' ],

		// remove trailing spaces at the end of template calls
		[ / +}}/, '}}' ],

		// lines containing only punctuation are likely junk
		[ /^[.,^]$/m, '' ],

		// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
		[ /([^!])--([^>])/, '$1—$2' ],

		// Remove spaces around hyphens between words
		// Eg. pack -house -> pack-house
		[ /(\w) ?- ?(\w)/, '$1-$2' ],

		// remove unwanted spaces before punctuation marks
		[ / ([);:?!,.])/, '$1' ],

		// ensure spaces after punctuation marks
		[ /([);:?!,.])([^ 0-9\n}|"'’”])/, '$1 $2' ],

		// ...but double punctuation doesn't get any spaces
		[ /([);:?!,.]) +([\n);:?!,.\]]|$)/, '$1$2' ],

		// Double full-stop is probably just (3 or 4 is OK - ellipsis)
		[ /(\w)\.\. (?=\w)/, '$1. ' ],

		// no spaces for inter-numeric punctuation
		[ /([0-9][,]) +([0-9]{3}(?![0-9]))/, '$1$2' ],

		// quotes at start of line can't be a close
		[ /^(['"]) (?=[A-Za-z])/m, '$1' ],

		// quotes at end of line can't be an open
		[ / (['"])$/m, '$1' ],

		// no space in "'s"
		[ / ?' ?s([\n ])/, '\'s$1' ],

		[ /\( +/, '(' ],
		[ / +\)/, ')' ],
		[ / *— */, '—' ],

		// Date ranges
		[ /([0-9]{3,4})-([0-9]{2,4})/, '$1–$2' ],

		// figures
		[ / ?, ?ooo/, ',000' ],

		// q.v. to q. v.
		[ /q\.v\./, 'q. v.' ],

		// i.e.
		// NOTE(review): the (?!') guard suggests the replacement may once have
		// carried italic quotes (''i.e.'') — confirm against the live script.
		[ /\bi\.? ?e\.(?!')/, "i.e." ],

		// & c. to &c.
		[ / ?& ?[coe][.,]([,]?)/, ' &c.$1' ],

		// this is an old pound noation
		// with a slash after a space
		[ /([0-9]) ?[/]\.(?=\s)/, "$1l." ],

		// No spaces between num and st/nd/rd
		[ /([0-9]) (st|nd|rd)\b/, '$1$2' ],

		[ /ty(one|two|three|four|five|six|seven|eight|nine|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth)/, 'ty-$1' ],

		// ﬁ ligature to fi
		[ /ﬁ/, 'fi' ],

		// NOTE(review): ſ is the long s, yet it is mapped to 'f' — confirm
		[ /ſ/, 'f' ],

		[ /_/, ' ' ]
	];

	process_editor( editor, new PartialWordRegexProcessor( reps ) );
};

/**
 * Fix common OCR character confusions, grouped by "misread -> intended".
 *
 * Every entry is [ regex, replacement, options? ] and is applied globally, in
 * order, by PartialWordRegexProcessor; later rules see the output of earlier
 * ones, so order matters. Assertions of the form (?<=\bX) are used rather than
 * \b(?<=X): the latter requires a word boundary between two word characters
 * and therefore can never match. Replacements must only reference capture
 * groups that exist in their pattern.
 *
 * @param {Object} editor Editor passed through to process_editor()
 */
const do_ocr_fixes = function ( editor ) {

	const reps = [

		// some apostrophes probably bogus at word start
		[ /\b([vw])'([a-z])/, '$1$2' ],

		// some mis-read full-stops
		[ /\b(?<=Mr|Mrs|Mssrs|Ms)'/, '.' ],

		// ^ -> '' : delete spurious carets
		[ /(?<=w)\^/, '' ],

		// ! -> l
		[ /ua!(?=\s)/, 'ual' ],

		// / -> f
		[ /\/ellow/, 'fellow' ],

		// / -> t
		[ /(\s)\/he\b/, '$1the' ],

		// £ -> f
		[ /£f\b/, 'ff' ],

		// « -> s
		[ /(?<=\w)«(?=\s)/, 's' ],

		// $ -> s
		[ /(?<=[a-z])\$/, 's' ],

		// }' -> y
		[ /r}'/, 'ry' ],

		// ' -> y
		// (only the apostrophe is consumed, so only 'y' is put back)
		[ /(?<=\b[Vv]er)'/, 'y' ],

		[ />(?=['"])/, '?' ],

		// } -> ?
		[ /(?<=[a-z]) }/, '?' ],

		[ /\('(?=yc)/, 'C' ],

		// 'I' -> T
		[ /(?<=\W)'[IJ]'(?=\w)/, 'T' ],

		// 0 -> O
		[ /\b0[*']([BNR])/, "O'$1" ], // Irish names

		// 1 -> i
		[ /(?<=\. )1(?=n|s|t)/, 'I' ],
		[ /1(?=n|s|t)/, 'i' ], // hard to tell In or in
		// avoid units, dates, and "1 of", "1 to" and "1 in"
		[ / 1 (?![0-9A-Z]|(or|to|in|of)\b|inch|mi\b|mile|ft|foot|cm|cent(i|\b)|dollar|pound|yard|metr|mm|km|kilo|acre|hect[ao])/, ' I ' ],

		// 4 -> d
		[ /4oor/, 'door' ],
		[ /e4\b/, 'ed' ],

		// 6 -> o
		[ /\b6(?=[a-z])/, 'o' ], // 6n, 6f, etc

		// 8 -> S
		[ /\b8(?=\w|\b)/, 'S' ], // 8o, etc, but not 8o00

		// 8i -> th
		[ /\b8i/, 'th' ],

		// a -> e
		[ /(?<=[Jj]udg)a/, 'e' ],

		// a -> o
		[ /\baf\b/, 'of' ],

		// a -> n
		[ /\baad/, 'and' ],
		[ /upoa/, 'upon' ],
		[ /\bia\b/, 'in' ],
		[ /(?<=[Rr])emaia/, 'emain' ],

		// a -> s
		[ /riaon/, 'rison' ],
		[ /wera\b/, 'wers' ],
		[ /\beap/, 'esp' ],

		// AA -> W
		[ /\b(AA|AV)(?=[a-z]{2})/, 'W' ],
		[ /\bnat\b/, 'not' ],

		// ae -> nc
		[ /aaee(|s|d)\b/, 'ance$1' ],

		// Av -> w
		[ /Av(ill|ith|ere\b|est|here|hat|as\b|ould|ho|or|hich|hen|ell|eigh|ise|eak|rit|ron)/, 'w$1' ],
		[ /AV(ill|ith|ere\b|est|here|hat|as\b|ould|ho|or|hich|hen|ell|eigh|ise|eak|rit|ron)/, 'W$1' ],
		[ /(?<=[a-z])AV\b/, 'w' ],

		// Avli -> wh
		[ /\bAvli(ich|om?(ever)?|en|ere|ether|y)\b/, 'wh$1' ],
		[ /\bAVli(ich|om?(ever)?|en|ere|ether|y)\b/, 'Wh$1' ],

		// b -> e
		[ /\b([Tt])hb/, '$1he' ],

		// b -> h
		[ /\bbow(so|ever|itz|beit)/, 'how$1' ], // watch for bowl...
		[ /(?<=\b[Tt])be(?=y\b|a\b|se\b|ir\b)/, 'he' ],
		[ /(?<=\b[Ww])b(?=i|e)/, 'h' ], // which, when
		[ /\bbas(|n't|ten)\b/, 'has$1' ],
		[ /\bber(|self|eto)\b/, 'her$1' ],
		[ /\bbim(|self)\b/, 'him$1' ],
		[ /([Ww])hicb/, '$1hich' ],
		[ /\b([Ss])bow/, '$1how' ],

		// b -> o
		[ /(?<=\b[Ss])b/, 'o' ],

		// b -> r
		[ /mbeb\b/, 'mber' ],
		[ /dmibal/, 'dmiral' ],
		[ /xtba/, 'xtra' ],
		[ /Victobia/, 'Victoria' ],

		// B -> E
		[ /(?<=\b(?:TH|THR))B/, 'E' ],

		// B -> R
		[ /Bailw/, 'Railw' ],
		[ /Boyal/, 'Royal' ],
		[ /\bFBO/, 'FRO' ],

		// c -> e
		[ /cx(?![ivxcdm]+\b)/, 'ex' ], // mind roman numerals
		[ /becn/, 'been' ],
		[ /\bbcen/, 'been' ],
		[ /(C|c)lcar/, '$1lear' ],
		[ /(a|u|o|p)pces\b/, '$1pees' ], // rupees,...
		[ /(C|c)asc(\b|(?=\w)[^a])/, '$1ase$2' ],
		[ /\bwc\b/, 'we' ],
		[ /(?<=[Ss]t|\b[Tt])cam/, 'eam' ],
		[ /(S|s)evc/, '$1eve' ], // several/severe
		[ /([Gg])rcat/, '$1reat' ],
		[ /([fvh])crence/, '$1erence' ],
		[ /(?<=\b[Hh])c\b/, 'e' ], // hc -> he
		[ /\bcn(?!i)/, 'en' ],
		[ /\bmcn\b/, 'men' ],
		[ /((?=\w)[^ao]|\b)rcs/, '$1res' ], // avoid arcs/orcs
		[ /\Borcs\b/, 'ores' ], // but it can be a suffix of ores
		[ /\bpcople(|s)\b/, 'people$1' ],
		[ /\b&e\.(?=\s|$)/, '&c.' ],
		[ /catc(|d)\b/, 'cate$1' ],
		[ /\bcight/, 'eight' ],
		[ /nccessar/, 'necessar' ],
		[ /\b([Ww])cr/, '$1er' ],
		[ /([^Aaeou])rcat/, '$1reat' ],
		[ /\b([Oo])nc(|s)\b/, '$1ne$2' ],
		[ /(?<=\b[Ss])[ec][ec](?=m|ing)/, 'ee' ], // seem, seeing
		[ /(?<=g)mics\b/, 'mies' ],
		[ /(?<=\b[Ss])tr[ce][ce]ct/, 'treet' ], // street
		[ /ocict/, 'ociet' ], // society
		[ /cither/, 'either' ], // cither exists, but...
		[ /(?<=\b[Ss])[ce][ce](?=d|\b|ing|m)/, 'ee' ], // see, seed, seeing
		[ /(?<=\b[Ss])c(?=er|ct)/, 'e' ], // seer... (not secretary)
		[ /(?<![ln])icf/, 'ief' ], // grief
		[ /c(?=ver|lectr)/, 'e' ], // ever, every, electric
		[ /(?<=[Pp])copl[ce]/, 'eople' ], // people
		[ /(?<=[Gg]rac|[Rr]os)c/, 'e' ], // grace, rose
		[ /(?<=[Cc]ru|[Yy]i)cl/, 'el' ], // cruel, yield, etc
		[ /cl(?=\b|l|f)/, 'el' ], // inc. scfl -> self
		[ /ncral(?=(?:s|ly|ity|ities)\b)/, 'neral' ], // general-
		[ /cth(?!ood|eroy|roat|yma|esis|etic|lip|idro|i\b)/, 'eth' ], // maketh, etc
		[ /tcd\b/, 'ted' ],

		[ /\b(t|tsz|sz)c\b/, '$1e' ], // chinese

		// ce -> œ
		[ /(?<=[Mm]an)ce(?=u)/, 'œ' ],

		// ci -> d
		[ /(P|p)rociu/, '$1rodu' ],
		[ /\bacidition(s|)\b/, 'addition$1' ],

		// ci -> ici
		[ /offci/, 'offici' ],

		// cnce: ence
		[ /cnce\b/, 'ence' ],

		[ /clves\b/, 'elves' ],

		// bom to born
		[ /\bbom\b/, 'born' ],

		// c -> d
		[ /aciva/, 'adva' ], // advantag...

		// c -> g
		[ /(\B[^\bzlp])inc\b/, '$1ing' ],

		// c -> o
		[ /\bcwn/, 'own' ],
		[ /cc(?=ln|ld|mp|lum|n|resp|s)/, 'co' ], // Lincoln, cold, company, ...
		[ /\bcc(?=urt)/, 'co' ], // court, not accurtation
		[ /\bcught/, 'ought' ],

		// c -> s
		[ /\b([dD])icre/, '$1isre' ], // disregard

		// ci -> d
		[ /eci\b/, 'ed' ],

		// d -> i
		[ /\bwdth/, 'with' ],

		// d -> o
		[ /d(?=mp|wn)/, 'o' ], // eg. compose, town
		[ /fdr/, 'for' ],

		// dl -> 31
		[ /\b[Sd3]lst\b/, '31st' ],

		// e -> a
		[ /\bscele(|s|d)\b/, 'scale$1' ],

		// e -> c
		[ /\be(ome)\b/, 'c$1' ],
		[ /rcet/, 'rect' ], // direct...
		[ /struet/, 'struct' ],
		[ /enee\b/, 'ence' ],
		[ /expeet/, 'expect' ],
		[ /((?=\B)[^n]|[oi]n)speet/, '$1spect' ], // avoid speet and Nunspeet
		[ /taeh/, 'tach' ], // detach
		[ /\bwhieh(|ever)\b/, 'which$1' ],
		[ /\bfec\b/, 'fee' ],
		[ /execpt/, 'except' ],
		[ /([^q])uet(ing|ed)\b/, '$1uct$2' ], // conducted
		[ /&e\./, '&c.' ],
		[ /(?<=[Uu]n)ele(?=s?\b)/, 'cle' ],

		// é -> è
		[ /ére\b/, 'ère' ], // No words end with acute-e ére

		// E -> F
		[ /E(rom )/, 'F$1' ],

		// e -> o
		[ /\bef\b/, 'of' ],
		[ /\bfrem\b/, 'from' ],
		[ /\bse\b/, 'so', { notLangs: [ 'es', 'fr', 'zh-pinyin' ] } ],

		// e -> r
		[ /rthee(?!ls)/, 'rther' ], // further, northern
		[ /outhee(?!ls|l\b)/, 'outher' ], // southern/ly
		[ /([^r])eoad/, '$1road' ], // broad

		// e -> s
		[ /\beo(|uth)\b/, 'so$1' ],
		[ /\bthoee\b/, 'those' ],

		// el -> d
		[ /\belyn/, 'dyn' ],
		[ /itel\b/, 'ited' ], // cited, united,...

		// -eney -> -ency (sad for Sweeny Todd)
		[ /eney\b/, 'ency' ],

		// er -> ev
		[ /\berery/, 'every' ],

		// é -> c
		[ /([aeiou])é(t)/, '$1c$2' ],

		// f -> nothing
		[ /\bhighfer/, 'higher' ],

		// f -> i
		[ /anfes\b/, 'anies' ],
		[ /stfan/, 'stian' ],

		// f -> l
		[ /(?<=[Aa])farm/, 'larm' ],

		// f -> t
		[ /\b(|in)difterent/, '$1different' ],
		[ /\bfwo/, 'two' ],

		// f -> r
		[ /(?<=\bB)[ft]it(?=ish|ain)/, 'rit' ],

		// ff -> fl
		[ /\bff(ood)\b/, 'fl$1' ],

		// ff -> ñ
		[ /(?<=[Ss])paf[ifl]a\b/, 'paña' ],

		// g -> ç
		[ /(?<=Mendon?)ga\b/, 'ça' ],
		[ /(?<=Gu?on?)g(?=all?o\b)/, 'ç' ],
		[ /Lorengo/, 'Lorenço' ],

		// G -> 6
		[ /\bG([0-9]*)th\b/, '6$1th' ],

		// h -> b
		[ /([Dd])ouht/, '$1oubt' ],
		[ /\bhe(en)\b/, 'be$1' ],
		[ /([Oo])hser/, '$1bser' ], // observe
		[ /\bhio/, 'bio' ],
		[ /\bemh/, 'emb' ],
		[ /\bheyo/, 'beyo' ],
		[ /\bohs\B/, 'obs' ],
		[ /\bhy\b/, 'by' ],
		[ /\bhe(?=ings?|en\b|an\b)/, 'be' ],
		[ /\bhene(?!icos|n|q)/, 'bene' ],

		// h -> c
		[ /\bhareful(|ly)/, 'careful$1' ],

		// h -> im
		// NOTE(review): the trailing \b sits between 'h' and 'n', so this can
		// never match — confirm the intended word before changing it
		[ /\bh(?=nony|nonies)\b/, 'im' ],

		// h/U -> li
		[ /\b(h|U)(fe|ke|ttle)\b/, 'li$2' ],
		[ /nghs([ht])/, 'nglis$1' ], // English, etc

		// h -> n
		[ /\bih(?![ilr])/, 'in' ],
		[ /lahd(?='?s?\b|ing'?s?\b)/, 'land' ],

		// h -> li
		[ /\bhv[ec](?=s|)\b/, 'live' ],
		[ /(?<=\b[Aa])hve\b/, 'live' ],
		[ /hng(?=s|ly)?\b/, 'ling' ],
		[ /dehc/, 'delic' ], // delicate, etc

		// h -> lt
		[ /cuh(?=(|y)\b)/, 'cult' ], // difficult(y), etc

		// H -> li
		[ /\bHke/, 'like' ],

		// H -> ll
		[ /(?<=\bA|[a-z])H/, 'll' ],

		// hv -> lw
		[ /(?<=[Aa]|ai|l)hvay/, 'lway' ], // always, railway, spillway

		// convert i9 to 19, etc.
		[ /[il]([0-9])/, '1$1' ],

		// i -> 1
		[ /\b[Il][Iil]th\b/, '11th' ],
		[ /(?<=[0-9])ist\b/, '1st' ],

		// I -> 1
		[ /\bIst\b/, '1st', { notLangs: [ 'de' ] } ],

		// i -> nothing
		[ /\bsomie/, 'some' ],
		[ /sielf/, 'self' ],
		[ /\b([Tt])hi(ey|ese)\b/, '$1h$2' ],
		[ /senise/, 'sense' ],
		[ /(?<=[Ff])irom/, 'rom' ],

		// I -> nothing
		// See also T -> nothing

		// i -> a
		[ /\bnime(ed|ly)/, 'namely' ],

		// i -> f
		[ /\bior(\b|m)/, 'for$1' ],
		[ /(I|i)nior/, '$1nfor' ],
		[ /([^m])afi(a|o)/, '$1aff$2' ],
		[ /\ba[ií]f/, 'aff' ],
		[ /([rhlf])iei(s|ly|)\b/, '$1ief$2' ], // brief

		// i -> j
		[ /(in|b|con|de|a)iect/, '$1ject' ],
		[ /\biett(y|ies)/, 'jett$1' ],

		// i -> l
		[ /([a-z])abie\b/, '$1able' ],
		[ /ficuit(|y)/, 'ficult$1' ],
		[ /enerai/, 'eneral' ],
		[ /\biab(o|ou)r/, 'lab$1r' ],
		[ /cicar/, 'clear' ],
		[ /shali(\b|ow)/, 'shall$1' ],
		[ /(i)abie\b/, '$1able' ], // reliable, ...
		[ /reiig/, 'relig' ],
		[ /([aeiou])riy\b/, '$1rly' ],
		[ /\b(un|)iaw/, '$1law' ],
		[ /\bgloi(y|ious)/, 'glor$1' ],
		[ /tiy\b/, 'tly' ],
		[ /iais\b/, 'ials' ], // materials...
		[ /\b([Ii])li(s?\b|ness)/, '$1ll$2' ],
		[ /(?<=[Ss]e)if/, 'lf' ], // self

		// -isli -> -ish
		[ /(\w)isli\b/, '$1ish' ],

		// i -> r
		[ /eiy(?![ua])/, 'ery' ],
		[ /([Ff])iist/, '$1irst' ],
		[ /([Gg])ieat/, '$1reat' ],
		[ /\b([Pp])oit(?![ior])/, '$1ort' ], // port/ion
		[ /beied\b/, 'bered' ],

		// i -> t
		[ /(a|o|i)iion/, '$1tion' ],
		[ /leci\b/, 'lect' ],
		[ /aier/, 'ater' ], // material
		[ /\bmulii/, 'multi' ],
		[ /\bihe/, 'the' ], // the, there...
		[ /nir(ies|y)/, 'ntr$1' ], // country
		[ /\bio(|wards?|gether)\b/, 'to$1' ],
		[ /\bihat\b/, 'that' ],
		[ /enily\b/, 'ently' ],
		[ /ciion/, 'ction' ],
		[ /(?<=[Bb]u)i/, 't' ],
		[ /Stewari/, 'Stewart' ],

		// i' in a word -> r (not 's)
		[ /(?<=[a-z])i'(?=[a-rt-z]|s\w)/, 'r' ],

		// i^ > r
		[ /(?<=[a-z])i\^/, 'r' ],

		// i- -> r (be more careful than ^, - can be right)
		[ /(?<=Yo)i-/, 'r' ],

		// I -> f
		[ /\bIor([^gim]|\b)/, 'for$1' ],

		// I -> l
		[ /\b[l1I]' ?(?=[AEIOUÉÈaeiouéè]\w)/, 'l\'' ],

		// I' at word start -> f (except I'd. I'm, I'll, etc)
		[ /\bI'([a-ce-kn-uw-z])/, 'f$1' ],

		// I- -> L
		[ /\bI-ord/, 'Lord' ],

		// I^ -> P
		[ /\bI\^/, 'P' ],

		// id -> nl
		[ /\boidy/, 'only' ],

		// id -> ul
		[ /\bshoidd/, 'should' ],

		// if -> i
		[ /(?<=\b[Oo])if\b/, 'f' ],

		// If -> N (happens in cap'd words)
		[ /\b([A-Z]+)If\b/, '$1N' ],

		// ii -> a
		[ /\biind\b/, 'and' ],
		[ /\biimount/, 'amount' ],

		// II -> H
		[ /\bII(e|[a-z]{2,})\b/, 'H$1' ],

		// ii -> h
		[ /tiie/, 'the' ],
		[ /hicii/, 'hich' ], // which

		// II -> M
		[ /II(?=r|s)/, 'M' ],

		// ii -> n
		[ /aiis(?!m)/, 'ans' ],
		[ /co(?:ii|tt)c/, 'conc' ],

		// ii -> u
		[ /(?<=\b[SsBbMm])ii/, 'u' ],
		[ /\bii(?!\b|i)/, 'u' ], // avoid roman nums iii
		[ /iiim(?=s?\b)/, 'ium' ],
		[ /(?<=[Yy])oii/, 'ou' ],

		// ii -> ü
		[ /(?<=\bHs?)iian\b/, 'üan' ],
		[ /\bMiiller/, 'Müller' ],
		[ /\bYii(?=n\b|an\b)/, 'Yü' ],
		[ /\bTriib/, 'Trüb' ],

		// -iiig -> -ing
		[ /iiig\b/, 'ing' ],

		// ij -> h
		[ /tija(?!j)/, 'tha' ],
		[ /([Tt])ij([ae])/, '$1h$2' ],

		// il -> H
		[ /(\W |\n)il(e|im|er)/, '$1 H$2' ],

		// Il -> H
		[ /\bIlo(?![ck]no|ilo|ko|na|ne\b|ngot|nka|rin|ts?\b|tycin|well)/, 'Ho' ],

		// in -> m
		[ /soine/, 'some' ],
		[ /inod(er|[^e])/, 'mod$1' ], // avoid ..node...
		[ /ninent/, 'nment' ], // government/s
		[ /\bcomin([au])/, 'comm$1' ], // community, communication, command
		[ /\biny(|self)\b/, 'my$1' ],
		[ /\binen\b/, 'men' ],
		[ /([^mst])inent/, '$1ment' ], // document...
		[ /(to|for|by|with|told|tell|let|g[ia]ve|from|towards|[oui]nto|under) ine\b/, '$1 me' ], // ine could be a suffix, so hit the common ones by ngram
		[ /\bimined/, 'immed' ],
		[ /\binean(|s)\b/, 'mean$1' ],
		[ /\bMohainn/, 'Mohamm' ],
		[ /sinug/, 'smug' ],
		[ /inforin/, 'inform' ],
		[ /\bhiin(self|)\b/, 'him$1' ],
		[ /\b([Ee])nin(i|e)/, '$1nm$2' ], // enmity, enmesh..
		[ /\b([Ff])roin\b/, '$1rom' ],
		[ /([Mm])einb/, '$1emb' ],

		// in -> th
		[ /(?<=(?:[Ii]n|[Tt]o|[Ff]or) )ine(?=\b|re\b|se\b|ir\b)/, 'the' ],

		// io -> w
		[ /\bneio(|ly)\b/, 'new$1' ],

		// ir -> n
		[ /\biir/, 'in' ],

		// it -> n
		[ /meitt/, 'ment' ],

		// iv -> j
		[ /\biv(?=st\b)/, 'ju' ],

		// iv -> w
		[ /\bneiv(|ly)\b/, 'new$1' ],
		[ /tiveen/, 'tween' ],

		// IVI -> M
		[ /\bIVI(?=[a-z])/, 'M' ],

		// j -> f
		[ /\boj\b/, 'of' ],

		// j -> i
		[ /thjs/, 'this' ],

		// J -> I
		[ /\bJowa/, 'Iowa' ],

		// J -> G
		[ /\b\(J(?=uide)/, 'G' ],

		// J -> l
		[ /\bJibert/, 'libert' ],
		[ /(?<=\b[Bb])jood/, 'lood' ], // blood

		[ /ojher/, 'other' ],

		// j -> y
		[ /ojal/, 'oyal' ],
		[ /\b([Mm])anj\b/, '$1any' ],
		[ /\b([Tt])hej\b/, '$1hey' ],

		// Ji -> h
		[ /Jiave/, 'have' ],
		[ /tJie/, 'the' ],

		// jl -> d
		[ /arjl/, 'ard' ],

		// jj -> g
		[ /jjht/, 'ght' ],

		// j}3^ -> y
		[ /(3|j|\})\^/, 'y' ],

		// k -> ic
		[ /whkh/, 'which' ],

		// kl -> d
		[ /Eklinb/, 'Edinb' ],

		// K -> E
		[ /Kng/, 'Eng' ],

		// l -> nothing
		[ /\b(|in)diflferent/, '$1different' ],
		[ /\beitlher\b/, 'either' ],
		[ /eaclh/, 'each' ],
		[ /Clhin(a|ese)/, 'Chin$1' ],
		// (no empty alternative in the lookbehind: it made the guard vacuous
		// and turned "build" into "buid")
		[ /(?<=[Ff]l|[Dd]r|ang|[Qq]|iq)uild/, 'uid' ], // fluid etc
		[ /(?<=\b[Tt])(?:lh|hl|jh|hj)(?=[ieo])/, 'h' ], // the, these, those, etc

		// l -> d
		[ /listor/, 'distor' ], // distort...

		// l -> f
		[ /\bol\b/, 'of' ],
		[ /\bl(orm)\b/, 'f$1' ],

		// l -> i
		[ /fui(\b|ness\b)/, 'ful$1' ],
		[ /(d|D)ipio/, '$1iplo' ],
		[ /(P|p)arll/, '$1arli' ],
		[ /\bWilllam/, 'William' ],
		[ /\b([Ff])lc/, '$1ic' ], // fiction
		[ /\b([Tt])helr/, '$1heir' ],
		[ /(?<=[Rr]|[Vv]|[Dd]|[Tt]|[g]|[Ff]|[Mm])ellc/, 'elic' ], // relic, delicate,

		// l -> I
		// NOTE(review): the leading '"' means this only fires after a double
		// quote, and the quote is dropped — confirm whether it is a typo
		[ /"\blon(a|ian)/, 'Ion$1' ],
		[ /\bl'(ve|ll)\b/, "I'$1" ],
		[ /\blt('?s|self)\b/, 'it$1' ],

		// l -> h
		[ /(a|o)rslip/, '$1rship' ], // scholarship, warships, worship
		[ /\b([Ww])hicl/, '$1hich' ],
		[ /(\w)encl\b/, '$1ench' ], // french, bench...

		// l ->li
		[ /\blke/, 'like' ],

		// l -> t
		[ /([0-9])lh\b/, '$1th' ],
		[ /\boul/, 'out' ],
		[ /([Aa])fler/, '$1fter' ],
		[ /ifl(?=\b|ness|ly)/, 'ift' ], // swift

		// la -> h
		[ /\bthrougla/, 'through' ],
		[ /\btla(?<!c)/, 'th' ],

		// li -> b
		[ /\blio([^n])/, 'bio$1' ], // not lion...
		[ /liject/, 'bject' ], // subject

		// li -> lh
		[ /\botlier(|s|wise)/, 'other$1' ],
		[ /\b([Mm])onarcli(|s|y)/, '$1onarch$2' ],

		// lT -> ff
		[ /di(lT|flP)ere/, 'differe' ],

		// l) -> b
		[ /al\) ?le\b/, 'able' ],

		// l^ -> f
		[ /l\^(?=[a-z])/, 'f' ],

		// li -> b
		[ /\bliy\b/, 'by' ],

		// li -> h ... "the", "them", "their", "with", "much", "here" and whe etcetera
		[ /([tT][Jl]i)(e|at|is|an|em|ear|eir|en|ither|ose|rough|ree)\b/i, 'th$2' ],
		[ /\b([SsWw])lie/, '$1he' ], // she, when...
		[ /\b([Ww])li(at|ole)/, '$1h$2' ], // what, whole
		[ /(wlicli|ivhic(li|h)|wliich|wiiich|whicli)/, 'which' ],
		[ /liurcli/, 'hurch' ],
		[ /\bli(ave|ere|is|ad|ard)/, 'h$1' ],
		[ /\bIl(is)\b/, 'H$1' ],
		[ /witli/, 'with' ],
		[ /mucli\b/, 'much ' ],
		[ /\blias/, ' has' ],
		[ /\bwlio/, 'who' ],
		[ /\b(an|)otlier\b/, '$1other' ],
		[ /ealtli/, 'ealth' ],
		[ /([Cc])lii/, '$1hi' ], // China/ese...
		[ /([SsMm])ucli/, '$1uch' ],
		[ /cliann/, 'chann' ],
		[ /ubhs/, 'ublis' ], // publish
		[ /\bliate/, 'hate' ],
		[ /liion/, 'hion' ], // fashion
		[ /(?<=[Tt])liing/, 'hing' ], // thing
		[ /(?<=[Nn]e|[Ee])itlier/, 'ither' ], // either, neither
		[ /(?<=[Cc]|\b)liarm/, 'harm' ],

		// li -> k
		// ('\b' inside a string literal is a backspace character, so the
		// optional trailing "s" is restored from its capture group instead)
		[ /([LlBb])ooli(\b|s)/, '$1ook$2' ],

		// llt -> th
		[ /\bllt(e)\b/, 'th$1' ],

		// lli -> th
		[ /\blli(at|e)\b/, 'th$1' ],

		// ln -> b
		[ /suln/, 'sub' ],
		[ /([Hh])md/, '$1ind' ],

		// lu -> hi
		[ /(?<=[a-z][^li])lucal/, 'hical' ], // -graphical

		// m -> in
		[ /mg\b/, 'ing' ],
		[ /\bopm/, 'opin' ],
		[ /Chm(a|ese)/, 'Chin$1' ],
		[ /(?<=\b[Pp]la)m/, 'in' ],

		// m -> n
		[ /\bFramce/, 'France' ],
		[ /\bFremch/, 'French' ],
		[ /\bJume\b/, 'June' ],

		// m -> on
		[ /atim\b/, 'ation' ],
		[ /\b(V|v)erbation\b/, '$1erbatim' ], // fix verbatim

		// m -> rn
		[ /ceming\b/, 'cerning' ],
		[ /\b([Uu]nw|[Ww])om\b/, '$1orn' ],
		[ /(?<=[Nn]orth|[Ss]outh|[Ee]ast|[Ww]est)em\b/, 'ern' ],
		[ /(?<=B[ij][oö])m\b/, 'rn' ],
		[ /Foumier/, 'Fournier' ],

		// m -> un
		[ /\bmorth/, 'unorth' ],

		// m -> w
		[ /\b([Nn])em([^aeo]|\b)/, '$1ew$2' ], // new, newly, news

		// mn -> nm
		[ /mnent/, 'nment' ],

		// mu -> nm
		[ /\bumu(?=[aeiou])/, 'unm' ],

		// N -> M
		[ /\bNongol/, 'Mongol' ],

		// n -> a
		[ /(G|g)rent/, '$1reat' ],
		[ /\bns/, 'as' ],
		[ /ncknow/, 'acknow' ],

		// n -> h
		[ /\btn(e|a)/, 'th$1' ],
		[ /\bwn/, 'wh' ],
		[ /([Ss])mitn/, '$1mith' ],

		// n -> in
		[ /(?<=[^Eaeiou])ng\b/, 'ing' ], // -ing

		// n -> m
		[ /(?<=I|i)nperi/, 'mperi' ], // imperial
		[ /(?<=H|h)inse/, 'imse' ], // himself
		[ /iun\b/, 'ium' ],
		[ /(?<=\b[a-z]\w+l)don/, 'dom' ], // no lowercase ends ldon
		[ /(?<=[Nn])unber/, 'umber' ],
		[ /stanp/, 'stamp' ],
		[ /\bn(?=ores?\b|oreover)/, 'm' ],

		// n -> o
		[ /\bnf/, 'of' ],

		// n -> ri
		[ /scnb/, 'scrib' ],

		// n -> u
		[ /\bont (of|the|to|in|a|that|and|for|with|by)\b/, 'out $1' ], // ont may be suffix, filter by common ngram
		[ /([Nn])nm(?!a)/, '$1um' ],
		[ /snb/, 'sub' ],
		[ /onsly\b/, 'ously' ],
		[ /(C|c|w|W|Sh|sh)onld/, '$1ould' ],
		[ /\b([Tt])h(r?)ongh/, '$1h$2ough' ], // though, through-
		[ /\b([Aa])bont\b/, '$1bout' ],
		[ /thongh/, 'though' ],
		[ /\b([Cc])onrt/, '$1ourt' ], // court

		// na -> m
		[ /\b([Hh])ina(|self)\b/, '$1im$2' ],

		// ni -> m
		[ /(?<=\b|[Hh]ere-?|[Hh]ence-?)froni(?=\b|age|ward)/, 'from' ],
		[ /(?<=\b[Ww])honi/, 'hom' ],
		[ /\bhini/, 'him' ],
		[ /(?<=in|)hunian/, 'human' ],
		[ /\bnian(?=u|ly|kind)/, 'man' ], // not too general, mind pinyin
		[ /\brenio/, 'remo' ],
		[ /\bni(?=ak)/, 'm' ],
		[ /niouth/, 'mouth' ], // mouth, Plymouth, etc
		[ /(?<=[Cc]o)ni(?=plet)/, 'm' ], // complete

		// ni -> m
		[ /\bnie\b/, 'me', { notLangs: [ 'de', 'pl', 'zh-pinyin' ] } ],
		[ /\bnian/, 'man', { notLangs: [ 'zh-pinyin' ] } ],
		[ /\btians/, 'trans', { notLangs: [ 'zh-pinyin' ] } ],

		// nn -> rm
		[ /(?<=[Ff])onn(?!ish)/, 'orm' ], // formula, form, etc

		// nv -> rw
		[ /nva(?=y|rd)/, 'rwa' ], // afterward, Norway

		// o -> a
		[ /\bouth(or|en)/, 'auth$1' ], // authority...
		[ /fovo(u?)r/, 'favo$1r' ],
		[ /\b([Cc])ous([ae])/, '$1aus$2' ], // cause

		// o -> c
		[ /jeot/, 'ject' ],
		[ /(?<=[Oo])oo(?=as|i[cp]|u[pl]|lu)/, 'cc' ],
		[ /(?<=[Oo])co(?=asi|lus|lud|upa|upi|ur)/, 'cc' ], // occasion, occur,
		[ /(?<=[Ss]uc)oe/, 'ce' ], // success
		[ /(?<=[Aa]c)o(?=us[ae]|ept|iden|ord)/, 'c' ], // accuse, accept
		[ /(?<=[Aa]r|ac)oh(?=[io])/, 'ch' ], // archi..., Gracchi,

		// o -> e
		[ /(?<=dis|\b)rospect/, 'respect' ],
		[ /turo\b/, 'ture' ],
		[ /([d])loss/, '$1less' ], // endless
		[ /\b([Mm])ako\b/, '$1ake' ],
		[ /\b([Mm])ado\b/, '$1ade' ],
		[ /noss(?=\b|es|like)/, 'ness' ],
		[ /\bcomo\b/, 'come', { notLangs: [ 'es' ] } ],

		// o -> n
		[ /tioos/, 'tions' ], // could be o -> u, but choose one
		[ /iog(|s)\b/, 'ing$1' ],

		// o -> u
		[ /egolar/, 'egular' ], // regular

		// ol -> d
		[ /nolix/, 'ndix' ],

		// p -> d
		[ /ecorp([^o]?)\b/, 'ecord$1' ],

		// p -> f
		[ /\bop\b/, 'of' ],

		// P -> F
		[ /\bP(ee)\b/, 'F$1' ],
		[ /\bOP\b/, 'OF' ],

		// p -> g
		[ /inp\b/, 'ing' ],
		[ /(?<!u)prap/, 'grap' ],

		// p -> n
		[ /apd\b/, 'and' ],

		// p -> o
		[ /prth/, 'orth' ],

		// P -> ?
		[ /([a-z])P\b/, '$1?' ],

		// q -> o
		[ /qf/, 'of' ],

		// G -> Q
		[ /\bGu(?=ite?|ee[rn]|i[dzvxp]|ir[^o]|in[tq]|iet|ick|ibb)/, 'Qu' ],

		// r -> c
		[ /jert/, 'ject' ], // object, etc
		[ /(\w)reive/, '$1ceive' ], // perceive, receive, etc

		[ /anrs\b/, 'ani\'s' ], // names ending in ani + 's

		// r -> i'
		[ /prs\b/, 'pi\'s' ],

		// r -> n
		[ /\bupor\b/, 'upon' ],

		// r -> v
		[ /(he|[iasolurn])sire/, '$1sive' ],
		[ /siveless/, 'siveness' ], // after sire->sive
		[ /\b(M|m)orement/, '$1ovement' ],
		[ /\b(G|g)orernment/, '$1overnment' ],
		[ /\b([Oo])bserr/, '$1bserv' ],

		// r -> t
		[ /(?<=\b[Ii])r\b/, 't' ],

		// r^ -> p
		[ /\br\^/, 'p' ],

		// ri -> n
		[ /(?<=\b[Mm]e)ri\b/, 'n' ],

		// ri -> u
		[ /ectrial/, 'ectual' ],

		// rj -> n
		[ /\birj/, 'in' ],

		// rn -> m
		[ /([aie])urn\b/, '$1um' ],
		[ /\brern/, 'rem' ],
		[ /ernent/, 'ement' ],
		[ /\brn/, 'm' ],

		// s -> a
		[ /grsph/, 'graph' ],
		[ /csuse/, 'cause' ],

		// s -> m
		[ /\b([Ss])ees(ing|ingly|ed|s)\b/, '$1eem$2' ], // seemed

		// sb -> sh
		[ /\bsb(e|all)\b/, 'sh$1' ],

		// sc -> g
		[ /insc\b/, 'ing' ],

		// t-> c
		[ /ettual/, 'ectual' ],
		[ /fetted/, 'fected' ],

		// t -> f
		[ /\bot\b/, 'of' ],
		[ /fitty/, 'fifty' ],

		// t -> i
		[ /shtp/, 'ship' ],
		[ /(?<=[Bb]u|[Cc]h|[Mm])tld/, 'ild' ],
		[ /(?<=[Bb]u|[Gg]u?|[Tt]|[Ss]|[Ff]|[Ww])tlt/, 'ilt' ],
		[ /\btn\b/, 'in' ],

		// T -> nothing (and some I -> nothing)
		[ /\bw [IT] (?=as|hich|hen|hether|ho)/, 'w' ], // w T as > was, etc

		// T -> I
		[ /(?<!\bw )\bT(?=\b|t)/, 'I' ],
		[ /T(?=reland|rish)/, 'I' ],

		// t -> l
		[ /abte\b/, 'able' ],
		[ /(?<=[WwCc]|[Ss]h)outd/, 'ould' ],

		// t -> r
		[ /(?<=\b[Ff])ot(?!h|o|i|u|m|c)/, 'or' ],
		[ /(?<=\b[Ff])t(ance|ench)/, 'r$1' ],
		[ /ntt(?=y|ies)/, 'ntr' ], // country
		[ /(?<=[Ll]ive)t(?=s|p|\b)/, 'r' ], // liver, Liverpool

		// T -> Y
		[ /(?<=\b(?:JUL|JOURNE|M|WA))T\b/, 'Y' ],
		[ /(?<=\b(?:MON|TUES|WEDNES|THURS|FRI|SATUR|SUN|))DAT\b/, 'DAY' ],

		// ti -> h
		[ /\b([Oo])ttier(?=\b|[^eis])/, '$1ther' ],

		// ti -> n
		[ /tioti/, 'tion' ],

		// ti -> u
		[ /\btipon/, 'upon' ],

		// to -> h
		[ /\bttoe(?![ds]\b)/, 'the' ],

		// U -> li, see h/U
		[ /(?<=\b|[a-z])Uon(?=s?)/, 'lion' ],
		[ /(?<=[a-z])Ung(?=s?)/, 'ling' ],

		// u -> a
		[ /Junu([^b])/, 'Janu$1' ],
		[ /\bund\b/, 'and' ],

		// u -> c
		[ /([Dd])ouum/, '$1ocum' ],

		// u -> h
		[ /(?<=\b[Tt])u(?=e[^s]|at\b)/, 'h' ], // the, there, these, etc (not Tuesday)

		// u -> n
		[ /\baud\b/, 'and' ],
		[ /meut(\b|[^e])/, 'ment$1' ],
		[ /siau(|s)\b/, 'sian$1' ], // Persians...
		[ /\b(P|p)ersou(|s)/, '$1erson$2' ],
		[ /erument/, 'ernment' ],
		[ /([Jj])uuc/, '$1unc' ],
		[ /taiu/, 'tain' ],
		[ /\biu(|to|ward)\b/, 'in$1' ],
		[ /\bauy(|where|body)\b/, 'any$1' ],
		[ /\biuto\b/, 'into' ],
		[ /kuow/, 'know' ],
		[ /iug(s|ed|ly|)\b/, 'ing$1' ],
		[ /auswer/, 'answer' ],

		// u -> ii
		// [ /(?<=\b[clxv]*)u(?=i*)/, 'ii' ], // roman numerals

		// "U" -> "ll" when preceded by a lowercase letter.
		// "U" -> "li"
		[ /(?<=[a-z])U(?=c)/, 'li' ], // relic
		[ /(?<=[a-z])U(?!c)/, 'll' ], // not relic

		// un -> m
		[ /\bimuned/, 'immed' ],

		// ui -> m ... "must", etc
		[ /\bui(ust)\b/, 'm$1' ],

		// v -> r
		[ /[Mm]emov/, 'memor' ],

		// v -> u
		[ /\b([Nn])vm/, '$1um' ],

		// v -> y
		[ /\bv(ear|our|ou)(s?)\b/, 'y$1$2' ],
		[ /\b(B|b|M|m|the)v/, '$1y' ],
		[ /\b(A|a)nv(\b|w)/i, '$1ny$2' ],
		[ /vield/, 'yield' ],
		[ /encv\b/, 'ency' ],
		[ /(?<=\b[GgHh])aye\b/, 'ave' ],
		[ /([Aa])bbev/, '$1bbey' ],
		[ /demv\b/, 'demy' ],
		[ /mplov/, 'mploy' ], // employ-...
		[ /itv\b/, 'ity' ],
		[ /(?<=[Vv])erv\b/, 'ery' ],
		[ /(?<=(Mon|Tues|Wednes|Thurs|Fri|\b)da)v(?=s?\b)/, 'y' ],

		// v -> w
		[ /\bvr/, 'wr' ],

		// v^ -> w
		[ /\bv[\^/]([a-z])/, 'w$1' ],

		// vc -> we
		[ /\bvc\b/, 'we' ],

		// vd -> wi
		[ /vd(ll|th)/, 'wi$1' ],

		// V -> m
		[ /\bV(iss|rs|r)\b/, 'M$1' ],

		// Vh ->Wh
		[ /\bVh/, 'Wh' ],

		// V' -> W
		[ /\bV'/, 'W' ],

		// Vi -> M
		[ /\bVir\b/, 'Mr' ],

		// vir -> w
		[ /hovir(?!u)/, 'how' ],

		// vn -> wi
		[ /vn(ll|th)/, 'wi$1' ],

		// VV -> W
		[ /\bVV(e)\b/, 'W$1' ],

		// w -> m
		[ /mewt(?!tide)/, 'ment' ],

		// w r -> w (not sure what this is about)
		[ /\bw r (?=e\b|[aeoiu]\w)/, 'w' ],

		// X -> N
		[ /\bX(?=o)/, 'N' ],

		// xv -> w
		[ /xvho/, 'who' ],
		[ /xvay/, 'way' ],
		[ /txvo/, 'two' ],

		// y -> v
		[ /([Ss])ery(a|i)/, '$1erv$2' ],
		[ /tiye(|ly|ness|nesses|s)\b/, 'tive$1' ],
		[ /eyies\b/, 'evies' ],
		[ /(?<=\b(?:[Hh]a|[BbGg]ra))ye\b/, 've' ], // have, grave, brave
		[ /\b([Oo])by(?=\B)/, '$1bv' ],
		[ /(?<=\bGene)ya/, 'va' ],
		// NOTE(review): the next two rules replace text with itself; given the
		// "y -> v" heading the patterns may have been meant as "eyent"/"yent"
		[ /\bevent/, 'event' ],
		[ /vent(?=\b|s|ed|or|ing|y\b|ies|ral|ro|ur|il|ri)/, 'vent' ],

		// Y -> T
		[ /\bY(?=he)/, 'T' ],

		// Y -> V
		[ /(?<=\bGENE)YA/, 'VA' ],
		[ /\bEYENT/, 'EVENT' ],
		// NOTE(review): no-op as written (see the lowercase rule above)
		[ /VENT(?=\b|S|ED|OR|ING|Y\b|IES|RAL|RO|UR|IL|RI)/, 'VENT' ],

		// z -> x
		[ /\bezc/, 'exc' ],

		// -> Rome/Roman
		[ /(E|K)om(e|an|ish)([ .,\n])/, 'Rom$2$3' ],

		// d', l', m', n' (not s', or english possesives get messed with)
		[ /(^|\s)([MmDdLlNnJjSsCc]|[Qq]u|[Jj]usqu)(' | ')(?=[AaEeIiOoUuÁáÀàéÉèÈ])/, "$1$2'" ]
	];

	process_editor( editor, new PartialWordRegexProcessor( reps ) );
};

/**
 * Fix OCR errors that involve word boundaries: dropped spaces, spurious
 * spaces, missing hyphens and stray punctuation.
 *
 * Runs five passes over the editor, in this order: partial-word regex
 * replacements, banned suffixes, banned prefixes, orphaned prefixes and
 * orphaned suffixes.
 *
 * @param {Object} editor the editor object handed on to process_editor
 */
const do_multiword_fixes = function ( editor ) {

	let reps = [

		// hyphens more likely to be em-dash
		// (the lookbehind was mistyped as "(<?=", which made this rule dead;
		// the trailing (?!-) keeps compounds such as "by-the-by" intact)
		[ /(?<=[a-z])-(the)\b(?!-)/, '—$1' ],

		// Missing spaces
		// theCap unlikely to be right
		[ /\b(a|an|of|by|the)(?=[A-Z])/, '$1 ' ],

		// single cap in a word probably a dropped space
		// watch for Mc/Mac
		// needs lookbehind really
		[ /\b(\w[a-z]*[abd-z])([A-Z][a-z]+\b)/, '$1 $2' ],

		// ance is a suffix when it's not ancestor's prefix
		[ /[\s-]ance(?! st[or])\b/, 'ance' ],

		[ /\bal though/, 'although' ],

		// and
		// not many words start and
		[ /\band((?=[a-z])[^raoei])/, 'and $1' ],

		[ /\bbet ween/, 'between' ],

		// I
		[ /I(am\b|had|was|will|can|shall|did)/, 'I $1' ],

		// he
		[ /([Hh]e)(had|did|can|will|was)/, '$1 $2' ],

		// him
		[ /(?<=\b([Hh]im))t/, ' t' ], // e.g. himto -> him to

		[ /notbe/, 'not be' ], // cannot be, not being, ...

		[ /([deos])n(' | ')t\b/, '$1n\'t' ],

		[ /\bcom m/, 'comm' ],
		// lookbehind was mistyped as "(<?=", so "incom parable" was missed
		[ /(?<=in|\b)com par/, 'compar' ],

		// government can only be -a, -s, -e
		[ /(overnment)((?=\w)[^sae])/, '$1 $2' ],

		[ /((?=\w)[^sa])may/, '$1 may' ], // dismay/gamay are the only words end in may

		[ /\bme(of|to|for|that)\b/, 'me $1' ],

		[ /(s|t)my\b/, '$1 my' ], // -my isn't always a likely suffix

		[ /\bof(a|b|c|d|g|m|n|p|s|w)/, 'of $1' ], // of my/self, etc words that can't start of-

		[ /\bof(our|my|some|him|her|his)\b/, 'of $1' ],

		// of merged left, careful of Russian names...
		[ /(Earl|Duke|Queen|King|Baron|most|all|some|many)of/, '$1 of' ],

		[ /([a-z])which/, '$1 which' ], // only wrong for everwhich

		// no word ends -many except overmany
		[ /([^Oo]?[^v]?[^e]?[^r\s])many/, '$1 many' ],

		// she
		[ /([Ss]he)(had|did|will|was)/, '$1 $2' ],

		[ /\bthus(?!ly|\b)/, 'thus ' ], // no words start thus

		// some obvious loss of spaces after 'the'
		[ /\bthe(?=h|me[nm]|mer[c]|mo|im|un|wh)/, 'the ' ],

		// and before 'the'
		[ /\b(\w[^aoniy\s])the\b/, '$1 the' ],

		// before 'to'
		[ /\b(thing)to\b/, '$1 to' ],

		[ /(u|n|r) (dices?)\b/, '$1$2' ],

		[ /\bun der/, 'under' ],

		[ /\brene w(ed|al|abl)\b/, 'renew$1' ],
		[ /\bre turn/, 'return' ],

		// words ending in cious that lost a space
		[ /cious((?=[a-z])[^enl])/, 'cious $1' ],

		// Spurious spaces
		[ /\b(P|p)ro ceed/, '$1roceed' ],
		[ /\b(P|p)ro ced/, '$1roced' ],
		[ /(C|c)on cl/, '$1oncl' ], // con clude
		[ /(un)?ans wer(a|e|s|\b)/, '$1answer$2' ],
		[ /same(a|b|c|f|g|h|i|j|k|m|o|p|q|u|v|w|x|y|z)/, 'same $1' ],
		[ /\bho w/, 'how' ], // however...

		[ /\b(dis|)satis fact/, '$1satisfact' ],
		[ /\bendo (wed|wing|wments?)/, 'endo$1' ],

		[ /\bre[ -](quest|quire|solute)/, 're$1' ],

		[ /\bwasnot\b/, 'was not' ],

		[ /\b(ly)(worked)\b/, '$1-$2' ],

		// missing hyphens
		[ /\binchief(?=s?\b)/, 'in-chief' ],
		[ /(?<=y)public(?=s?\b)/, '-public' ], // notary-public, ...

		// Lone quotes at the start of a quotation
		[ /(?<=(said|answered|replied|shouted|thought|whispered|murmured|muttered|), ") /, '' ],

		// spurious punctuation, eg why. not, but avoid e.g. i.e. etc
		[ /([a-z]{3,})\. ([a-z])/, '$1 $2' ]
	];

	process_editor( editor, new PartialWordRegexProcessor( reps ) );

	// These are things that are never suffixes
	// eg. hecould -> he could
	reps = [
		/(c|sh|w)ould(n't)?/
	];

	process_editor( editor, new BannedSuffixProcessor( reps ) );

	// These can never be prefixes
	// so insert spaces after them
	reps = [
		/[Aa](?=number|bond\b|comm|rece|reci[^b])/,
		/a(?=dele)/,
		/be(?=my)/,
		/but(?=al)/, // but all, but always
		/come(?=to)/,
		/great(?=m|p|r)/,
		/[HhSsGg]ave(?=my)/, // h/gave my/self
		/me(?=wit|tow)/,
		/means/,
		/of(?=the)/,
		/sent(?=as)/,
		/some(?=[cm])/,
		/that(?=can|d|w)/, // that will
		/the(?=mes|tr|e\w)/,
		/(?:un|)usual(?!s|ness|ly)/,
		/I(?=h[eiou])/,
		/I(?=ha[^b])/, // I have/had
		/with(?=a\b|a[^lm]|all)/,
		/with(?=his|her|it|th|ha)/
	];

	process_editor( editor, new BannedPrefixProcessor( reps ) );

	// if we see these on their own, they are prefixes of the next word
	// These can be slightly aggressive, as they only fire if the prefix is
	// already isolated - they won't break up existing words
	let orphans = [
		/(a|fo)llo/, // allocate, follow
		/(un|)acknow/,
		/(|[Ii]n)conse/, // consequence, consecrate
		/circum/,
		/combin?/,
		/(|[Ii]n)compa/,
		/(|[iI]n)comple/,
		/(|[Ii]n)corp/,
		/\w*corres?/,
		/diffi/, // difficult, diffident
		/dis/, // very few words end dis, so an orphan is likely a prefix
		/decla?/, // ration can't be a simple suffix
		/ered/,
		/exper?/,
		/helio/,
		/inex/,
		/medi/, // medicine/s, medical
		/misbe/,
		/(|in)oppor/,
		/(|dis|co-?|acc|in|sub|super)ordin?/,
		/[Pp]arti/,
		/[Pp]hilo/,
		/(|im|mal)prac/,
		/(|im)practi/,
		/pre/, // pre is occasionally a suffix
		/(|un)[Pp]rinci/,
		/reca/,
		/(|p|un|under)recom/, // recommend
		/repre/,
		/(|un|tran)sub/,
		/suc/, // success...
		/(|un)sug/, // suggest, sugary...
		/sur/, // sur-
		/trans/,
		/undis/,
		/whatso/
	];

	process_editor( editor, new OrphanPrefixProcessor( orphans ) );

	// if we see these on their own, they're suffixes of the prior word
	orphans = [
		/astic/,
		/ated/,
		/atory/,
		/(|ond|ti)ar(y|ies)/,
		/tably/,
		/butors?/,
		/cating(|ly)/,
		/cellation(|s)/,
		/cien(cy|t)/,
		/ciples?/,
		/dences?/,
		/derable/,
		/digent(|s)/,
		/dit(y|ies)/,
		/drawals?/, // only withdrawal
		/ested(|ly|ness)/,
		/esque(|ly)/,
		/ficial\w*/,
		/geous(|ly|ness|nesses)/,
		/gences?/,
		/hend(|s|ing)/,
		/iast\w*/,
		/ings?/, // ing is rarely a prefix, much more likely to be -ing if it occurs alone
		/lants/,
		/lated/,
		/lative(s|ly|)/, // comp-, decla-
		/ligent(|ly|sia|sias)/,
		/mations?/, // not motions
		/munication?/,
		/ments?/,
		/mence\w*/, // commence
		/mitted(|ly|ness)/,
		/nect(ed|ions?)/,
		/nence/,
		/nese/,
		/nien(ce|ces|ced|t)/,
		/m?on(ing|ed)/, // summoned, commissioned...
		/pan(y|ies)/,
		/pensat\w+/, // compensate
		/plet(ed|ion|ions)/,
		/politan\w*/,
		/pl?oration(|s|al)?/,
		/rative(s|ly|)/, // comp-, decla-
		/rit(ies|y)/,
		/rence(|d|s)/,
		/saries/, // anniversaries...
		/sion\w*/,
		/siderable\w*/, // avoid sideral/sideration
		/sume(\b|[^r]\w*|r[^i]\w*)/, // avoid -sumeria
		/stantly/,
		/tain(ed|s)/,
		/[as]tr[au]ction(|s|al|ary|ally)/,
		/[szt]?[aoiue]?tion(|s|al|ally)/, // not ration
		/tages?/,
		/ti[vn]ely/,
		/tinual(|ly|ness|ity)/,
		/tinuous(|ly|ness)/,
		/b?ilit(ies|y)/,
		/vid(es|ing)/,
		/wered/
	];

	process_editor( editor, new OrphanSuffixProcessor( orphans ) );
};

/**
 * Hand a fixed list of common Latin phrases to an ItaliciseProcessor.
 *
 * @param {Object} editor the editor object handed on to process_editor
 */
const do_foreign_italics = function ( editor ) {
	const reps = [
		// "ad hoc", "ad valorem", "ad infinitum", ...
		// \w* rather than .* so the match stops at the end of the Latin word
		// instead of running greedily to the last "um"/"em" on the line
		/\bad (hoc|\w*um|\w*em)\b/,
		/de facto/,
		/quid pro quo/,
		/locum tenens/,
		/\b[Ii]bid\b/
	];

	process_editor( editor, new ItaliciseProcessor( reps ) );
};

/**
 * Run the whole-word replacement pass.
 *
 * The built-in list is currently empty, so the pass is a hook for future
 * whole-word fixes.
 *
 * @param {Object} editor the editor object handed on to process_editor
 */
const do_whole_words_reps = function ( editor ) {

	// simple whole-word replacements
	const reps = [
	];

	process_editor( editor, new WholeWordRegexProcessor( reps ) );
};

/**
 * Repair words where a long s was OCR'd as "f" (or as a look-alike
 * character), e.g. "firft" -> "first".
 *
 * The rules are [ pattern, replacement ] pairs handed to a
 * PartialWordRegexProcessor. Several rules depend on earlier ones having
 * already run, so the order of the list matters.
 *
 * @param {Object} editor the editor object handed on to process_editor
 */
const doLongSReplacements = function ( editor ) {
	const long_s_reps = [
		// fix bad long s replacements
		[ /ƒ/, 'f' ],
		[ /ʃ/, 's' ],

		[ /([^i])fic\b/, '$1sic' ],
		[ /([Ee])aft/, '$1ast' ],
		[ /([W])eft/, '$1est' ], // assume Weft is West, but weft is like fabric
		[ /(af|un)?focia/, '$1socia' ],
		[ /(?<=[Aa])ff(embl|ign)/, 'ss$1' ], // assign, assemble..
		[ /(A|a)nfwer/, '$1nswer' ],
		[ /(ef)?fent/, '$1sent' ], // essential, sent, sentinel
		[ /(other|like)wife/, '$1wise' ],
		[ /\bfide\b/, 'side' ],
		[ /\bfo\b/, 'so' ],
		[ /\breft/, 'rest' ],
		[ /([Aa])bfo/, '$1bso' ],
		[ /ccef[fs]/, 'ccess' ],
		[ /bfurd/, 'bsurd' ],
		// only the matched letters are replaced ("assist" gave "assisttance")
		[ /affif/, 'assis' ],
		[ /aff(um|ur|er)/, 'ass$1' ], // assume, assure
		[ /(?<=A|a)fc/, 'sc' ], // ascent
		[ /Afia/, 'Asia' ],
		[ /(?<=A|a)fk/, 'sk' ], // ask
		[ /aftard/, 'astard' ],
		[ /aftic/, 'astic' ],
		[ /afty/, 'asty' ],
		[ /([Aa])lfo/, '$1lso' ],
		[ /([Aa])pfe/, '$1pse' ],
		[ /([Aa])ufp/, '$1usp' ],
		[ /baffy/, 'bassy' ],
		[ /([Bb])afe/, '$1ase' ],
		[ /([Bb]|[Cc]r)eft/, '$1est' ],
		[ /([Cc])afua/, '$1asua' ],
		[ /([Cc])auf/, '$1aus' ],
		[ /([Cc])eaf(?!a)/, '$1eas' ],
		[ /ceff/, 'cess' ], // necessary
		[ /cefs\b/, 'cess' ], // princess, process
		[ /([Cc])heft/, '$1hest' ],
		[ /Chrif/, 'Chris' ],
		[ /cife/, 'cise' ],
		[ /([Cc])laf[fs]/, '$1lass' ],
		[ /([Cc])lofe/, '$1lose' ],
		[ /([Cc])onf(id|t|eq)/, '$1ons$2' ], // const, conseq...
		[ /([Cc])ourfe/, '$1ourse' ],
		[ /([Cc])oft/, '$1ost' ],
		[ /([Cc])roff\B/, '$1ross' ], // cross-
		[ /([Cc])rofs\b/, '$1ross' ], // cross
		[ /([Dd])efcr/, '$1escr' ],
		[ /dorf(e|es|ed|ing|ings|ment)/, 'dors$1' ],
		[ /efer([vt])/, 'eser$1' ], // deserve-, desert-
		[ /([dD])if([ocprgqst]|ad)/, '$1is$2' ], // dis-
		[ /\b([dD])if([^f]\w)/, '$1is$2' ],
		[ /([Dd])iffol/, '$1issol' ],
		[ /([Dd])efir/, '$1esir' ],
		[ /efour/, 'esour' ],
		[ /offef[fs]/, 'ossess' ],
		[ /feffion/, 'session' ], // session (possession comes later)
		[ /(?<![A-Z]|ff|\b)eff(|ed|ion|ing|ly)/, 'ess$1' ], // express, etc
		[ /([Ee])fpe/, '$1spe' ], // especial
		[ /([Ee])fq/, '$1sq' ],
		[ /(?<=R|r|t|l|p)egift/, 'egist' ], // regist.., strategist, etc
		[ /(?<=en)lift/, 'list' ],
		[ /fenf(e|es|ed|ing|ings)\b/, 'sens$1' ],
		[ /enf(e|es|ed|ing|ings)\b/, 'ens$1' ],
		// the suffix is group 2 ('$1est$1' re-inserted the initial letter)
		[ /([Bb])eft(\b|ed|ing)/, '$1est$2' ],
		[ /([^kgrdw])eft\b/, '$1est' ], // -est
		[ /efide/, 'eside' ],
		[ /(?<=R|r)efort/, 'esort' ],
		[ /(?<=R|r|t|l|p)egift/, 'egist' ], // regist.., strategist, etc
		[ /([Ee])fta/, '$1sta' ], // establish
		[ /([Ee])fti/, '$1sti' ], // estimate
		[ /enfes/, 'enses' ],
		[ /ennf/, 'enns' ], // Pennsylv etc
		[ /erfal/, 'ersal' ],
		[ /erfon/, 'erson' ],
		[ /erfua/, 'ersua' ],
		[ /erfue/, 'ersue' ],
		[ /erfui/, 'ersui' ],
		[ /eruf/, 'erus' ],
		[ /fa(cr|fe|ga|id|le|lut|lt|tis|w\b|nds?\b)/, 'sa$1' ],
		[ /\bfay/, 'say' ],
		[ /\bfa(ve|vi)/, 'sa$1' ],
		[ /(?<=F|\bf)alf/, 'als' ], // false
		[ /fatif(?!e)/, 'satis' ],
		[ /fca([^s])/, 'sca$1' ], // scarce, scant, etc (not briefcase)
		[ /fchem/, 'schem' ],
		[ /fc(ie|ious|ure|en|rib|rip)/, 'sc$1' ], // science, conscious, secure
		[ /fenf/, 'sens' ],
		[ /fe(a\b|af|cl|co|iz)/, 'se$1' ], // season, seclude, second
		[ /fee(m|n|ing)/, 'see$1' ], // seen, seem
		[ /fe(ek|gr|duc)/, 'se$1' ],
		[ /felec/, 'selec' ],
		[ /fel(f|v)/, 'sel$1' ],
		[ /(?<=[Aa]b|[Ii]n)fence/, 'sence' ],
		[ /fepar/, 'separ' ],
		[ /feri([eo])/, 'seri$1' ],
		[ /fervi/, 'servi' ],
		// keep the matched suffix ('set' alone turned "fetting" into "set")
		[ /\bfet(|ting|s|ter)\b/, 'set$1' ],
		[ /fettle(\b|m|s)/, 'settle$1' ], // fettle is a word, but settle is way more common
		[ /feve(ra|n)/, 'seve$1' ], // several, seven
		[ /fhew/, 'shew' ],
		[ /(?<=\ba?)fide(?=s?\b)/, 'side' ],
		[ /fing(le|u)/, 'sing$1' ], // single, singular
		[ /fis\b/, 'sis' ], // -sis
		[ /ffidu/, 'ssidu' ], // Assiduous
		[ /fh(al|ut|ip|o)/, 'sh$1' ],
		[ /inifter/, 'inister' ],
		[ /fidera/, 'sidera' ], // considerable/ation/ate
		[ /fift(?!h)/, 'sist' ], // subsist, consist
		[ /filen/, 'silen' ],
		[ /fign/, 'sign' ],
		[ /fimi/, 'simi' ],
		[ /fince/, 'since' ],
		[ /fion/, 'sion' ],
		[ /firft/, 'first' ],
		[ /fite\b/, 'site' ],
		[ /fitive/, 'sitive' ],
		[ /fitu/, 'situ' ],
		[ /flaught/, 'slaught' ],
		[ /flowl/, 'slowl' ],
		[ /flowne/, 'slowne' ],
		[ /fm(an|en|all|oth|ooth)/, 'sm$1' ], // small, helmsmen, smooth
		[ /focie/, 'socie' ],
		[ /fole/, 'sole' ],
		[ /foli/, 'soli' ],
		[ /folv/, 'solv' ],
		[ /fome/, 'some' ],
		[ /foon/, 'soon' ],
		[ /foph/, 'soph' ], // -sopher/y
		[ /fourc/, 'sourc' ],
		// NOTE(review): this capitalises a lowercase match - confirm intended
		[ /fouth/, 'South' ],
		[ /fov/, 'sov' ],
		[ /fpade/, 'spade' ],
		[ /fpawn/, 'spawn' ],
		[ /fpeak/, 'speak' ],
		[ /fpec/, 'spec' ],
		[ /fpee/, 'spee' ],
		[ /fpir/, 'spir' ], // spirit, spiral,
		[ /ft(air|an|at|eem|ep|ill|on|oo|r|ud|y)/, 'st$1' ],
		[ /\bft(\w)/, 'st$1' ],
		[ /fubf/, 'subs' ], // do before fub
		[ /fub/, 'sub' ],
		[ /fucc/, 'succ' ],
		[ /fuch/, 'such' ],
		[ /fued/, 'sued' ],
		[ /\bfu(e|es|ings?)\b/, 'su$1' ],
		[ /fuf(p)/, 'sus$1' ],
		[ /fuff/, 'suff' ],
		[ /fund(?!rais)/, 'sund' ],
		[ /fumm/, 'summ' ], // summit, summary
		[ /fuit/, 'suit' ],
		[ /fuper/, 'super' ],
		[ /fupp/, 'supp' ],
		[ /fu(re|rv)/, 'su$1' ],
		[ /fw(ay|ear|orn)/, 'sw$1' ],
		[ /fyf/, 'sys' ],
		[ /fym/, 'sym' ],
		[ /grefs/, 'gress' ],
		[ /hift/, 'hist' ],
		[ /(?<=[Hh])(ea|o|oa|ou)rf/, '$1rs' ], // house, hearse, horse
		[ /i[sf]cuff/, 'iscuss' ],
		[ /ifh/, 'ish' ],
		[ /ifm\b/, 'ism' ],
		[ /ifo\b/, 'iso' ],
		[ /ifon/, 'ison' ],
		[ /iftic/, 'istic' ],
		[ /([Ii])ffu/, '$1ssu' ],
		[ /illuf/, 'illus' ],
		[ /(I|i)nft/, '$1nst' ],
		// the word boundary belongs before the "i": "\b(?<=i|I)fl" asked for
		// a boundary between two letters and so could never match
		[ /(?<=\b[Ii])fl/, 'sl' ], // isle, island
		[ /Jefus/, 'Jesus' ],
		[ /(?<=J|j|I|i)urif/, 'uris' ],
		[ /([Jj])uft/, '$1ust' ],
		[ /([Ll])aft/, '$1ast' ], // last, lastly, etc
		[ /lefia/, 'lesia' ],
		[ /([Ll])egif/, '$1egis' ], // legislation...
		[ /([^ie])efs/, '$1ess' ], // -ess
		// the "l" is only looked behind at, so it must not be re-inserted
		[ /(?<=l|L)eff/, 'ess' ], // -ess-
		[ /lifle/, 'lisle' ],
		[ /lifh/, 'lish' ],
		[ /lufiv/, 'lusiv' ],
		[ /([MmPp])afs\b/, '$1ass' ],
		[ /([Mm])i(fs\b|ff\B)/, '$1iss' ], // miss, missing
		[ /([Mm])i(f\B)/, '$1is' ], // mistake
		[ /Missifippi/, 'Missisippi' ],
		[ /Missiffippi/, 'Mississippi' ],
		[ /([Mm])oft/, '$1ost' ], // keep the case of the initial
		[ /mongft/, 'mongst' ],
		[ /([Mm])uft/, '$1ust' ], // keep the case of the initial
		[ /nefe/, 'nese' ],
		[ /nefs/, 'ness' ],
		[ /nfate/, 'nsate' ],
		[ /nfel(?=\b|s|led|l[oe]rs?)/, 'nsel' ],
		[ /nfive/, 'nsive' ],
		[ /oaft/, 'oast' ], // coast, etc
		[ /obf/, 'obs' ],
		[ /([Oo])bfe/, '$1bse' ], // observ
		[ /ofed/, 'osed' ],
		[ /offi/, 'ossi' ], // possible
		[ /ofition/, 'osition' ], // position, etc.
		[ /ofity/, 'osity' ],
		[ /oftil/, 'ostil' ], // hostile
		[ /ouf\b/, 'ous' ],
		[ /oufly/, 'ously' ],
		[ /([Pp])aft/, '$1ast' ],
		[ /hraf/, 'hras' ], // phrase
		[ /paff/, 'pass' ], // pass/age, for pafs, see mafs
		[ /([Pp])leaf/, '$1leas' ],
		[ /([Pp])of(e|t)/, '$1os$2' ], // post, pose, compose...
		[ /(?<=P|p)urfu/, 'ursu' ],
		[ /(?<=R|r)ef([pfs]|en|ume|ump)/, 'es$1' ],
		[ /([Rr])eleaf/, '$1eleas' ],
		[ /(?<=R|r)aif(e|i)/, 'ais$1' ], // raising, raised/r
		[ /\b([Aa]r|[Rr])if([ie])/, '$1is$2' ], // a/rising/ed/es
		[ /rofec/, 'rosec' ], // prosecute
		[ /rofef([sf])/, 'rofess' ],
		[ /rofp/, 'rosp' ],
		[ /urpof/, 'urpos' ],
		[ /([Qq])ueft/, '$1uest' ],
		[ /reafo/, 'reaso' ],
		[ /refea/, 'resea' ],
		[ /refi/, 'resi' ],
		[ /([Tt])afte/, '$1aste' ],
		[ /(?<=T|t)eft/, 'est' ],
		[ /terfect/, 'tersect' ], // intersect, but not perfect, etc
		[ /hefe/, 'hese' ], // these
		[ /([Hh])ofe/, '$1ose' ], // those, whose
		[ /tereft/, 'terest' ],
		[ /traft/, 'trast' ],
		[ /ranf/, 'rans' ], // trans-
		[ /ufe/, 'use' ],
		[ /uftom/, 'ustom' ],
		[ /vaft/, 'vast' ],
		[ /(?<=V|v)erf/, 'ers' ], // verse, versus
		[ /([Vv])eff/, '$1ess' ], // keep the case of the initial
		[ /verf([eyo])/, 'vers$1' ], // verse, verso -versy
		[ /vife/, 'vise' ], // advise..
		[ /([Vv])ifi/, '$1isi' ],
		[ /ifdom/, 'isdom' ],
		[ /xift/, 'xist' ]
	];

	process_editor( editor, new PartialWordRegexProcessor( long_s_reps ) );
};

/**
 * Expand template shorthands and tidy a few bits of markup in the page body,
 * header and footer fields.
 *
 * @param {Object} editor editor object; must provide forField() and a
 *  chainable replace()
 */
const template_cleanup = function ( editor ) {
	const header = editor.forField( '#wpHeaderTextbox' );
	const footer = editor.forField( '#wpFooterTextbox' );

	// {{c}} to {{center}}
	editor.replace( /{{c\|/g, '{{center|' );
	header.replace( /{{c\|/g, '{{center|' );
	footer.replace( /{{c\|/g, '{{center|' );

	// {{rh}} to {{RunningHeader}}
	header.replace( /\n?{{rh\|/gi, '{{RunningHeader|' );

	// more cleanup
	editor
		// {{hws}} & {{hwe}} expanded
		.replace( /{{hws\|/g, '{{hyphenated word start|' )
		.replace( /{{hwe\|/g, '{{hyphenated word end|' )

		// {{di}} expanded
		.replace( /{{di\|/g, '{{dropinitial|' )

		// {{hi}} expanded
		.replace( /{{hi\|/g, '{{hanging indent|' )

		// {{sm}} expanded
		.replace( /{{sm\|/g, '{{smaller|' )

		// {{...}} replaced (disabled)
		// .replace(/{{\.{3}}}/g, '…')

		// expand diacritical templates
		// (the literal is split in two, which is what the lint suppression
		// below is for)
		// eslint-disable-next-line no-useless-concat
		.replace( /{{(ae|oe|\w[:`'~^-])}}/g, '{' + '{subst:$1}}' )

		// convert {{—}} to —
		.replace( /{{—}}/g, '—' );

	// M<sup>c</sup> to {{Mc}}
	editor.replace( /M<sup>c<\/sup>/g, '{{Mc}}' );
	header.replace( /M<sup>c<\/sup>/g, '{{Mc}}' );

	// section tag fix
	// NOTE(review): the original pattern and replacement were lost from the
	// damaged source. This reconstruction quotes unquoted section names
	// (<section begin=foo /> -> <section begin="foo" />) - confirm against
	// the live script.
	editor.replace(
		/<section\s+(begin|end)\s*=\s*([^\s"'/>][^\s/>]*)\s*\/>/g,
		'<section $1="$2" />'
	);

	// refs don't have space before them
	editor.replace( /\s<ref/g, '<ref' );
};

/**
 * Call every function in Cleanup.cleanupFunctions, in order, with the body
 * editor plus the header and footer field editors.
 *
 * @param {Object} editor editor object; must provide forField()
 */
const do_extra_functions = function ( editor ) {
	const headerField = editor.forField( '#wpHeaderTextbox' );
	const footerField = editor.forField( '#wpFooterTextbox' );

	const invoke = function ( extraFn ) {
		extraFn( editor, headerField, footerField );
	};

	Cleanup.cleanupFunctions.forEach( invoke );
};

/**
 * Replace curly quotes with straight ones, also dropping the stray space
 * just inside an opening or closing curly quote.
 *
 * @param {Object} editor editor object with a chainable replace()
 */
const do_replaceSmartQuotes = function ( editor ) {
	// replace smart quotes
	editor
		// the spaced forms go first so the space is removed with the quote
		.replace( /“ /g, '"' )
		.replace( / ”/g, '"' )
		.replace( /[“”]/g, '"' )
		.replace( /‘ /g, "'" )
		.replace( / ’/g, "'" )
		.replace( /[‘’]/g, "'" );
};

/**
 * Join the hard-wrapped lines of a scanned page into paragraphs.
 *
 * Short lines that end a sentence and are followed by a line that starts
 * like a new sentence are first turned into paragraph breaks (controlled by
 * Cleanup.shortLineThreshold; 0 or less disables this). Remaining single
 * line breaks are then replaced by spaces.
 *
 * @param {Object} editor editor object with get(), set() and a chainable
 *  replace()
 */
const collapse_line_breaks = function ( editor ) {
	// stuff to do only if the page doesn't contain a <poem> tag
	// NOTE(review): the tag literal was lost from the damaged source (it read
	// indexOf( ' ' ), which is true for almost any page); '<poem>' is a
	// reconstruction - confirm against the live script.
	if ( editor.get().indexOf( '<poem>' ) !== -1 ) {
		return;
	}

	// first, a hack! [T230415]
	const short_line_thresh = Cleanup.shortLineThreshold;

	if ( short_line_thresh > 0 ) {
		const lines = editor.get().split( /\r?\n/ );

		for ( let i = 0; i < lines.length - 1; i++ ) {
			if ( ( lines[ i ].length < short_line_thresh ) &&
					lines[ i ].match( /[.!?'"”’—]\s*$/ ) &&
					// anchored: the NEXT LINE must start with a quote, capital
					// or digit (unanchored, a capital anywhere would match)
					lines[ i + 1 ].match( /^\s*['"“‘A-Z0-9]/ ) ) {
				// the extra newline becomes a blank line once re-joined
				lines[ i ] += '\n';
			}
		}

		editor.set( lines.join( '\n' ) );
	}

	editor

		// remove single line breaks; preserve multiple.
		// not if there's a tag, template, table syntax either side of line break
		// ($2 sits inside the negative lookahead and is therefore always empty)
		.replace( /([^>}\n])\n(?!( *\||[{}<]|\n|=|\*|#))/g, '$1 $2' )

		// collapse sequences of spaces into a single space
		.replace( / +/g, ' ' )

		// two quotes are probably two lines
		.replace( /" "/g, '"\n\n"' );
};

/**
 * Collapse paragraph breaks where the second paragraph starts with a
 * lowercase letter (so the break is probably bogus).
 *
 * @param {Object} editor editor object with get() and replace()
 */
const collapseSuspiciousParagraphs = function ( editor ) {
	// NOTE(review): the tag literal was lost from the damaged source (it read
	// indexOf( ' ' )); '<poem>' is a reconstruction - confirm against the
	// live script.
	if ( editor.get().indexOf( '<poem>' ) === -1 ) {
		editor
			// remove paragraph breaks if the second para starts lowercase
			.replace( /\n\n+(?=[a-z])/g, ' ' );
	}
};

const do_small_abbrs = function ( editor, abbr_list ) {

for ( const abbr of abbr_list ) {

let re_str = ''; let good = '';

for ( let i = 0; i < abbr.length; i++ ) { re_str += abbr[ i ] + '[.,]? ?';				good += abbr[ i ] + '.'; }

re_str = '(\\s)' + re_str + '(?=\\s)'; // new word, but not in template const re = new RegExp( re_str, 'g' );

const smallAbbrTemplate = 'asc';

good = `$1{{${smallAbbrTemplate}|${good}}}`;

editor.replace( re, good ); }	};

/**
 * Click the input inside the span.quality3 element of the edit form.
 */
const markProofread = function () {
	// eslint-disable-next-line no-jquery/no-global-selector
	$( 'span.quality3 input' ).trigger( 'click' );
};

/**
 * Replace the contents of the edit summary box (#wpSummary).
 *
 * @param {string} summary_text the new summary
 */
const set_summary = function ( summary_text ) {
	// eslint-disable-next-line no-jquery/no-global-selector
	$( '#wpSummary' ).val( summary_text );
};

/**
 * Mark the page as proofread and, when Cleanup.editSummary is set, put that
 * text in the edit summary.
 */
const do_markProofread = function () {
	// if doing cleanup, must be proofreading
	markProofread();

	if ( Cleanup.editSummary ) {
		set_summary( Cleanup.editSummary ); // clear old summary
	}
};

/**
 * The main cleanup function: runs every cleanup stage, in order, on the
 * page text. Optional stages are switched by the Cleanup config object.
 *
 * @param {Object} editor the templatescript editor object
 */
function do_cleanup( editor ) {

	// Any cleanups that need the context of the old line breaks
	do_pre_collapse_cleanup( editor );

	// Do this before line collapses
	if ( Cleanup.remove_running_header ) {
		process_editor( editor,
			new RunningHeaderProcessor( Cleanup.runningHeaderPatterns ) );
	}

	// Do this first, so we can correct words across collapsed line breaks
	collapse_line_breaks( editor );

	if ( Cleanup.collapseSuspiciousParagraphs ) {
		collapseSuspiciousParagraphs( editor );
	}

	// Generic cleanup
	do_generic_cleanup( editor );

	// OCR and scanno fixing

	// Do the simple replacements first, as it's easier to write these
	// if you don't have to guess what intermediate state the page is in
	if ( Cleanup.additionalOcrReplacements.length > 0 ) {
		process_editor( editor,
			new PartialWordRegexProcessor( Cleanup.additionalOcrReplacements ) );
	}

	do_ocr_fixes( editor );
	do_multiword_fixes( editor );

	if ( Cleanup.italiciseForeign ) {
		do_foreign_italics( editor );
	}

	if ( Cleanup.italicWords.length > 0 ) {
		process_editor( editor, new ItaliciseProcessor( Cleanup.italicWords ) );
	}

	do_whole_words_reps( editor );

	if ( Cleanup.doLongSReplacements ) {
		doLongSReplacements( editor );
	}

	if ( Cleanup.doTemplateCleanup ) {
		template_cleanup( editor );
	}

	if ( Cleanup.replaceSmartQuotes ) {
		do_replaceSmartQuotes( editor );
	}

	do_small_abbrs( editor, Cleanup.smallAbbreviations );

	// Any extra functions
	do_extra_functions( editor );

	if ( Cleanup.markProofread ) {
		do_markProofread();
	}
}

/**
 * Entry point used by the cleanup tool: runs do_cleanup and logs any
 * exception it throws instead of letting it propagate.
 *
 * @param {Object} editor the templatescript editor object
 */
function do_cleanup_wrapper( editor ) {
	log( DEBUG, 'Cleaning up...' );

	try {
		do_cleanup( editor );
	} catch ( err ) {
		// report the failure; the closing debug message is still logged
		log( ERROR, err );
	}

	log( DEBUG, 'Cleanup done.' );
}

/**
 * Find the index at which two strings (or arrays) first differ.
 *
 * @param {string|Array} a
 * @param {string|Array} b
 * @return {number} index of the first differing element; the length of the
 *  shorter input when one is a prefix of the other; -1 when they are equal
 */
function find_first_diff_pos( a, b ) {
	const limit = Math.min( a.length, b.length );

	let pos = 0;
	while ( pos < limit && a[ pos ] === b[ pos ] ) {
		pos += 1;
	}

	if ( pos < limit ) {
		return pos;
	}

	// common prefix exhausted: equal only if neither has anything left over
	return a.length === b.length ? -1 : limit;
}

/**
 * Transpose a list of arrays: row i of the result holds element i of every
 * input array.
 *
 * The result has as many rows as the LONGEST input; shorter inputs
 * contribute undefined. (Sizing by the first array alone silently dropped
 * the tail of any longer array.)
 *
 * @param {Array[]} arrays
 * @return {Array[]} one row per index; [] when no arrays are given
 */
function zip( arrays ) {
	let longest = 0;
	for ( const array of arrays ) {
		longest = Math.max( longest, array.length );
	}

	const rows = [];
	for ( let i = 0; i < longest; i++ ) {
		rows.push( arrays.map( function ( array ) {
			return array[ i ];
		} ) );
	}
	return rows;
}

// Page text as it was before the last test run, so it can be restored;
// null until a test has been run.
let test_test_to_restore = null;

/**
 * Run the cleanup on the current text and compare the result with the
 * content of this page's "/expected" subpage.
 *
 * The editor is outlined green when the texts match and red otherwise;
 * mismatching lines are logged.
 *
 * @param {Object} editor editor object with get() and set()
 */
function do_cleanup_test( editor ) {

	const text = editor.get();
	test_test_to_restore = text;

	do_cleanup( editor );

	const cleaned = editor.get();

	const show_result = function ( colour ) {
		// eslint-disable-next-line no-jquery/no-global-selector
		$( '.wikiEditor-ui' ).css( 'outline', '2px solid ' + colour );
	};

	// Load the "expected" subpage and see if the text matches

	mw.loader.using( 'mediawiki.api' ).done( function () {
		const api = new mw.Api();
		api.get( {
			action: 'query',
			titles: mw.config.get( 'wgPageName' ) + '/expected',
			prop: 'revisions',
			rvprop: 'content',
			rvslots: 'main',
			formatversion: 2,
			rvlimit: 1
		} ).done( function ( data ) {
			// a missing subpage comes back without any revisions
			const page = data && data.query && data.query.pages &&
				data.query.pages[ 0 ];
			const revision = page && page.revisions && page.revisions[ 0 ];

			if ( !revision ) {
				log( ERROR, 'Could not load the expected text' );
				show_result( 'red' );
				return;
			}

			const expected = revision.slots.main.content;

			let colour = 'green';

			if ( expected !== cleaned ) {
				log( ERROR, "Expected text doesn't match!" );

				const pairs = zip( [ expected.split( '\n' ), cleaned.split( '\n' ) ] );

				for ( const pr of pairs ) {
					if ( pr[ 0 ] !== pr[ 1 ] ) {
						log( ERROR, 'Line mismatch' );
						log( ERROR, `Expected: '${pr[ 0 ]}', Got: '${pr[ 1 ]}'` );

						if ( pr[ 0 ] && pr[ 1 ] ) {
							const indx = find_first_diff_pos( pr[ 0 ], pr[ 1 ] );

							// show both lines from the first difference onwards
							log( ERROR, pr[ 0 ].slice( indx ) );
							log( ERROR, pr[ 1 ].slice( indx ) );
						}
					}
				}

				colour = 'red';
			}

			show_result( colour );
		} ).fail( function () {
			log( ERROR, 'Request for the expected text failed' );
			show_result( 'red' );
		} );
	} ); // end using
}

/**
 * Undo a test run: put back the text saved by the last cleanup test (if
 * any) and remove the result outline from the editor.
 *
 * @param {Object} editor editor object with set()
 */
function do_cleanup_test_restore( editor ) {

	// compare against null so that an empty original text is restored too
	if ( test_test_to_restore !== null ) {
		editor.set( test_test_to_restore );
	}

	// eslint-disable-next-line no-jquery/no-global-selector
	$( '.wikiEditor-ui' ).css( 'outline', '' );
}

/**
 * Load the TemplateScript library and register the cleanup action with it.
 * When Cleanup.enableTesting is set, the test and restore actions are
 * registered as well.
 */
function add_templatescript() {

	$.ajax( '//tools-static.wmflabs.org/meta/scripts/pathoschild.templatescript.js', {
		dataType: 'script',
		cache: true
	} ).then( function () {

		const cleanup_entry = {
			name: Cleanup.actionTitle,
			position: 'cursor',
			script: do_cleanup_wrapper,
			enabled: true
		};

		if ( Cleanup.cleanupAccesskey ) {
			cleanup_entry.accessKey = Cleanup.cleanupAccesskey;
		}

		const entries = [ cleanup_entry ];

		if ( Cleanup.enableTesting ) {
			entries.push( {
				name: 'Test cleanup',
				script: do_cleanup_test
			} );

			entries.push( {
				name: 'Restore pre-cleanup',
				script: do_cleanup_test_restore
			} );
		}

		// eslint-disable-next-line no-undef
		pathoschild.TemplateScript.add(
			entries, {
				category: Cleanup.portletCategory,
				forNamespaces: Cleanup.activeNamespaces
			} // common fields
		);
	}, function () {
		// without the library no action can be registered; say so rather
		// than failing silently
		log( ERROR, 'Could not load TemplateScript' );
	} );
}

/**
 * Fire the "<signature>.config" hook with the Cleanup object, then add the
 * cleanup tools unless Cleanup.enable is false afterwards.
 */
function really_run() {
	log( DEBUG, 'Really_run' );

	// handlers receive the Cleanup object itself, so any change they make is
	// seen by the check below
	mw.hook( signature + '.config' ).fire( Cleanup );

	if ( Cleanup.enable ) {
		add_templatescript();
	} else {
		log( DEBUG, 'Cleanup disabled' );
	}
}

/**
 * Start the script once: later calls return immediately.
 */
function run() {
	if ( Cleanup.started ) {
		return;
	}
	Cleanup.started = true;
	really_run();
}

// Start once loading of the 'user' module has settled and the DOM is ready.
// .always() calls run whether the combined promise resolves or rejects.
$.when( mw.loader.using( 'user' ), $.ready ).always( run );

// eslint-disable-next-line no-undef }( jQuery, mediaWiki ) );