Jump to content

User:Jan.Kamenicek/PageCleanUp.js

From Wikisource
Note: After saving, changes may not occur immediately. Click here to learn how to bypass your browser's cache.
  • Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Cmd-R on a Mac)
  • Google Chrome: Press Ctrl-Shift-R (Cmd-Shift-R on a Mac)
  • Internet Explorer: Hold Ctrl while clicking Refresh, or press Ctrl-F5
  • Opera: Clear the cache in Tools → Preferences

For details and instructions about other browsers, see Wikipedia:Bypass your cache.

/*jshint boss:true*/
/*global $, mw*/

/**
 * This script adds a toolbar button for cleaning up the OCR text.
 */

( function ( mw, $ ) {

	/**
	 * Tidy raw OCR wikitext for proofreading.
	 *
	 * The text is passed through an ordered chain of regular-expression
	 * substitutions: whitespace trimming, dash normalisation, re-joining of
	 * words hyphenated across line breaks, paragraph re-flowing (skipped when
	 * the page contains a <poem> tag), entity/quote normalisation, common OCR
	 * misreadings, missing apostrophes and a few template conversions.
	 * The order of the substitutions matters; later rules rely on earlier ones.
	 *
	 * @param {string} text Wikitext of the page being edited.
	 * @return {string} The cleaned-up wikitext.
	 */
	function cleanUp( text ) {
		text = text

			// Start by trimming leading and trailing whitespace.
			.trim()

			// remove trailing spaces at the end of each line
			.replace(/ +\n/g, '\n')

			// remove trailing whitespace preceding a hard line break
			.replace(/ +<br *\/?>/g, '<br />')

			// remove trailing whitespace and numerals at the end of page text
			// (numerals are nearly always page numbers in the footer)
			.replace(/[\s\d]+$/g, '')

			// remove trailing spaces at the end of refs
			.replace(/ +<\/ref>/g, '</ref>')

			// remove trailing spaces at the end of template calls
			.replace(/ +}}/g, '}}')

			// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
			.replace(/([^\!])--([^>])/g, '$1—$2')

			// Replace double-em-dash with a two-em bar.
			.replace(/——/g, '{{bar|2}}')

			// Remove whitespace around dashes, but only if there is whitespace on
			// both sides (we don't want to remove the trailing space from
			// "...as follows:— "). A former follow-up rule for " +— +" was
			// dropped: everything it could match is already handled here.
			.replace( /\s+—\s+/g, '—' ) // Em dash
			.replace( /\s+–\s+/g, '–' ) // En dash

			// join words that are hyphenated across a line break
			// (but leave "|-" table syntax alone)
			.replace(/([^\|])-\n/g, '$1');

		// clean up pages if they don't have <poem>
		// (line breaks inside poems are significant and must not be re-flowed)
		if ( text.indexOf( "<poem>" ) === -1 ) {
			text = text
				// lines that start with " should probably be new lines,
				// if the previous line ends in punctuation,
				// other than a comma or semicolon
				// and let's get rid of trailing space while we're at it
				.replace(/([^\n\w,;])\n\" */g, '$1\n\n"')

				// lines that end with " should probably precede a new line,
				// unless preceded by a comma,
				// or unless the new line starts with a lower-case letter;
				// and let's get rid of preceding space while we're at it
				.replace(/([^,])\ *\"\n([^a-z\n])/g, '$1"\n\n$2')

				// remove single line breaks; preserve multiple.
				// but not if there's a tag, template or table syntax either side of the line break
				.replace(/([^>}\|\n])\n([^:#\*<{\|\n])/g, '$1 $2')

				// collapse sequences of spaces into a single space
				.replace(/  +/g, ' ');
		}

		// more page cleanup
		text = text
			// dump spurious hard breaks at the end of paragraphs
			.replace(/<br *\/?>\n\n/g, '\n\n')

			// remove unwanted spaces around punctuation marks
			.replace(/ ([;:\?!,])/g, '$1')

			// unicodify
			.replace(/&mdash;/g, '—')
			.replace(/&ndash;/g, '–')
			.replace(/&quot;/g, '"')

			// straighten quotes and apostrophes.
			.replace(/[“”]/g, '"')
			.replace(/[‘’`]/g, '\'')

			//OCR fixes
			// convert i9 to 19, etc.
			.replace(/[il]([0-9])/g, '1$1')

			// "the", "them", "their", etcetera
			.replace(/tlie/g, 'the')

			// "U" -> "ll" when preceded by a lowercase letter.
			.replace(/([a-z])U/g, '$1ll')

			// "would", "could"
			.replace(/woidd/g, 'would')
			.replace(/coidd/g, 'could')
			.replace(/shoidd/g, 'should')

			// many works have apostrophes missing from OCR
			.replace(/([a-z]) s\b/g, '$1\'s') // it's he's etc
			.replace(/n t\b/g, 'n\'t') //can't isn't didn't etc
			.replace(/([a-zI]) ll\b/g, '$1\'ll') // I'll we'll etc
			.replace(/\bI m\b/g, 'I\'m') // I'm
			.replace(/\b([Yy])ou re\b/g, '$1ou\'re') // you're
			.replace(/\b([Ww])e re\b/g, '$1e\'re') // we're
			.replace(/\b([Tt])hey re\b/g, '$1hey\'re') // they're
			.replace(/([a-zI]) ve\b/g, '$1\'ve') // I've we've etc

			// expand diacritical templates
			// (the 'subst' + ':' split keeps the literal from being substituted
			// when this script page itself is saved)
			.replace(/{{((ae|oe|\w[:`'~^-]))}}/g, '{{subst'+':$1}}')

			// replace "float center" with "block center"; original template name was misleading enough to warrant routinely fixing
			.replace(/\{\{float center/g, '{{block center')

			// Center tags are converted to the {{center}} template.
			// [\s\S] matches any character including newlines; the previous
			// class [.\n] only matched a literal dot or a newline, so ordinary
			// content was never converted.
			.replace(/<center>\s*([\s\S]*?)\s*<\/center>/g, '{{center|$1}}')

			// Remove unwanted ligatures: replace every "fi" (U+FB01) and "fl"
			// (U+FB02) ligature character with the two plain letters. Escapes
			// are used so the code points survive re-encoding of this file.
			.replace(/\uFB01/g, 'fi')
			.replace(/\uFB02/g, 'fl')

			;
		return text;
	}

	// When WikiEditor fires its 'wikiEditor.toolbarReady' hook, add a
	// "Page clean-up" button to the 'format' group of the 'main' section.
	mw.hook( 'wikiEditor.toolbarReady' ).add( function ( $textarea ) {
		// Button callback: run the textarea's current value through cleanUp()
		// and write the result back.
		function applyCleanUp() {
			$textarea.val( cleanUp( $textarea.val() ) );
		}

		// Tool definitions keyed by tool name.
		var toolSet = {};
		toolSet[ 'Jan.Kamenicek-PageCleanUp' ] = {
			label: 'Page clean-up',
			type: 'button',
			icon: 'https://upload.wikimedia.org/wikipedia/commons/thumb/6/6d/Oxygen480-actions-tools-check-spelling.svg/22px-Oxygen480-actions-tools-check-spelling.svg.png',
			action: {
				type: 'callback',
				execute: applyCleanUp
			}
		};

		$textarea.wikiEditor( 'addToToolbar', {
			section: 'main',
			group: 'format',
			tools: toolSet
		} );
	} );

}( mediaWiki, jQuery ) );