User:Jan.Kamenicek/PageCleanUp.js

Note: After saving, changes may not occur immediately. Click here to learn how to bypass your browser's cache.

Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Cmd-R on a Mac)
Google Chrome: Press Ctrl-Shift-R (Cmd-Shift-R on a Mac)
Internet Explorer: Hold Ctrl while clicking Refresh, or press Ctrl-F5
Opera: Clear the cache in Tools → Preferences

For details and instructions about other browsers, see Wikipedia:Bypass your cache.

Code that you insert on this page could contain malicious content capable of compromising your account. If you are unsure whether code you are adding to this page is safe, you can ask at the central discussion page, Scriptorium. The code will be executed when previewing this page under some skins, including Monobook. You can Purge this page in the interim if you wish to refresh the content sooner under another skin.

Documentation for this script can be added at User:Jan.Kamenicek/PageCleanUp.

/*jshint boss:true*/
/*global $, mw*/

/**
 * This script adds a toolbar button for cleaning up the OCR text.
 */

( function ( mw, $ ) {

	/**
	 * Clean up OCR page text.
	 *
	 * Applies a fixed sequence of regular-expression replacements: whitespace
	 * trimming, dash normalisation, re-joining of hyphenated and hard-wrapped
	 * lines (skipped when the text contains a <poem> tag), punctuation spacing,
	 * entity and quote normalisation, a handful of common OCR misreadings,
	 * missing apostrophes, and a few template/tag conversions.
	 *
	 * The order of the replacements matters: later rules operate on the output
	 * of earlier ones.
	 *
	 * @param {string} text Wikitext to clean up.
	 * @return {string} The cleaned-up wikitext.
	 */
	function cleanUp( text ) {
		text = text

			// Start by trimming leading and trailing whitespace.
			.trim()

			// Remove trailing spaces at the end of each line.
			.replace(/ +\n/g, '\n')

			// Remove trailing spaces preceding a hard line break, and
			// normalise the break itself to "<br />".
			.replace(/ +<br *\/?>/g, '<br />')

			// Remove trailing whitespace and numerals at the end of page text
			// (numerals are nearly always page numbers in the footer).
			.replace(/[\s\d]+$/g, '')

			// Remove trailing spaces at the end of refs.
			.replace(/ +<\/ref>/g, '</ref>')

			// Remove trailing spaces at the end of template calls.
			.replace(/ +}}/g, '}}')

			// Convert double-hyphen to em dash. The "!" and ">" exclusions keep
			// the HTML comment delimiters "<!--" and "-->" intact.
			.replace(/([^\!])--([^>])/g, '$1—$2')

			// Replace double-em-dash with a two-em bar.
			.replace(/——/g, '{{bar|2}}')

			// Remove whitespace (including line breaks) around dashes, but only
			// when there is whitespace on both sides.
			.replace( /\s+—\s+/g, '—' ) // Em dash
			.replace( /\s+–\s+/g, '–' ) // En dash

			// Remove spacing around em dash, but only if it has spaces on both sides
			// (we don't want to remove the trailing space from "...as follows:— ",
			// bearing in mind that the space will already be gone if at end of line).
			// NOTE(review): every match of this pattern has already been consumed
			// by the \s+—\s+ rule above, so this rule is currently a no-op.
			.replace(/ +— +/g, '—')

			// Join words that are hyphenated across a line break
			// (but leave "|-" table syntax alone).
			.replace(/([^\|])-\n/g, '$1');

		// Paragraph re-flowing is only applied to pages without <poem>.
		if ( text.indexOf( "<poem>" ) === -1 ) {
			text = text
				// Lines that start with " should probably be new paragraphs
				// if the previous line ends in punctuation other than a comma
				// or semicolon; also drop spaces following the quote mark.
				.replace(/([^\n\w,;])\n\" */g, '$1\n\n"')

				// Lines that end with " should probably precede a new paragraph,
				// unless the quote is preceded by a comma or the next line starts
				// with a lower-case letter; also drop spaces preceding the quote.
				.replace(/([^,])\ *\"\n([^a-z\n])/g, '$1"\n\n$2')

				// Turn single line breaks into spaces; preserve multiple ones.
				// Skipped when tag, template, table or list syntax sits on
				// either side of the line break.
				.replace(/([^>}\|\n])\n([^:#\*<{\|\n])/g, '$1 $2')

				// Collapse sequences of spaces into a single space.
				.replace(/  +/g, ' ');
		}

		// More page cleanup.
		text = text
			// Dump spurious hard breaks at the end of paragraphs.
			.replace(/<br *\/?>\n\n/g, '\n\n')

			// Remove unwanted spaces before punctuation marks.
			.replace(/ ([;:\?!,])/g, '$1')

			// Unicodify.
			.replace(/&mdash;/g, '—')
			.replace(/&ndash;/g, '–')
			.replace(/&quot;/g, '"')

			// Straighten quotes and apostrophes (backticks included).
			.replace(/[“”]/g, '"')
			.replace(/[‘’`]/g, '\'')

			// OCR fixes.
			// Convert i9 to 19, etc.
			.replace(/[il]([0-9])/g, '1$1')

			// "the", "them", "their", etcetera.
			.replace(/tlie/g, 'the')

			// "U" -> "ll" when preceded by a lowercase letter.
			.replace(/([a-z])U/g, '$1ll')

			// "would", "could", "should".
			.replace(/woidd/g, 'would')
			.replace(/coidd/g, 'could')
			.replace(/shoidd/g, 'should')

			// Many works have apostrophes missing from OCR.
			.replace(/([a-z]) s\b/g, '$1\'s') // it's he's etc
			.replace(/n t\b/g, 'n\'t') // can't isn't didn't etc
			.replace(/([a-zI]) ll\b/g, '$1\'ll') // I'll we'll etc
			.replace(/\bI m\b/g, 'I\'m') // I'm
			.replace(/\b([Yy])ou re\b/g, '$1ou\'re') // you're
			.replace(/\b([Ww])e re\b/g, '$1e\'re') // we're
			.replace(/\b([Tt])hey re\b/g, '$1hey\'re') // they're
			.replace(/([a-zI]) ve\b/g, '$1\'ve') // I've we've etc

			// Expand diacritical templates.
			// The replacement string is deliberately split ('{{subst' + ':...');
			// presumably so that saving this script on-wiki does not trigger a
			// substitution - keep it split.
			// NOTE(review): backticks were already turned into apostrophes by the
			// quote-straightening rule above, so the "`" alternative can no longer
			// match here and {{x`}} is rewritten as {{subst:x'}} - confirm intent.
			.replace(/{{((ae|oe|\w[:`'~^-]))}}/g, '{{subst'+':$1}}')

			// Replace "float center" with "block center"; the original template
			// name was misleading enough to warrant routinely fixing.
			.replace(/\{\{float center/g, '{{block center')

			// Center tags are converted to the {{center}} template.
			// [\s\S] matches any character including newlines; the previous
			// class [.\n] only matched literal dots and newlines, so ordinary
			// content was never converted.
			.replace(/<center>\s*([\s\S]*?)\s*<\/center>/g, '{{center|$1}}')

			// Remove unwanted ligatures. The "g" flag is required so that every
			// occurrence is replaced, not just the first one on the page.
			.replace(/\uFB01/g, 'fi') // U+FB01 LATIN SMALL LIGATURE FI
			.replace(/\uFB02/g, 'fl') // U+FB02 LATIN SMALL LIGATURE FL

			;
		return text;
	}

	// Once the 'wikiEditor.toolbarReady' hook fires, add a "Page clean-up"
	// button to the 'format' group of the toolbar's 'main' section. Clicking
	// the button replaces the textarea's content with its cleaned-up version.
	mw.hook( 'wikiEditor.toolbarReady' ).add( function ( $textarea ) {
		var cleanUpButton, toolSet;

		cleanUpButton = {
			label: 'Page clean-up',
			type: 'button',
			icon: 'https://upload.wikimedia.org/wikipedia/commons/thumb/6/6d/Oxygen480-actions-tools-check-spelling.svg/22px-Oxygen480-actions-tools-check-spelling.svg.png',
			action: {
				type: 'callback',
				execute: function () {
					// Read the current text, clean it, and write it back.
					var cleaned = cleanUp( $textarea.val() );
					$textarea.val( cleaned );
				}
			}
		};

		// The tool is registered under this script's own identifier.
		toolSet = {};
		toolSet[ 'Jan.Kamenicek-PageCleanUp' ] = cleanUpButton;

		$textarea.wikiEditor( 'addToToolbar', {
			section: 'main',
			group: 'format',
			tools: toolSet
		} );
	} );

}( mediaWiki, jQuery ) );