Jump to content

User:Alien333/poemise+.js

From Wikisource
Note: After saving, changes may not occur immediately. Click here to learn how to bypass your browser's cache.
  • Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Cmd-R on a Mac)
  • Google Chrome: Press Ctrl-Shift-R (Cmd-Shift-R on a Mac)
  • Internet Explorer: Hold Ctrl while clicking Refresh, or press Ctrl-F5
  • Opera: Clear the cache in Tools → Preferences

For details and instructions about other browsers, see Wikipedia:Bypass your cache.

// <nowiki>
// Rewrite of parts of my own poemise.js (yeah, I know) to test new, very, very promising features. There are an awful lot of experimental values around here; each one is flagged with EVA (Experimental Values Again).
(()=>{
if (mw.config.get("wgCanonicalNamespace") != "Page" || !["edit", "submit"].includes(mw.config.get("wgAction"))) return; // avoid pointless console errors
// rounded arithmetic mean of an array of numbers. an empty array gives NaN (0 divided by 0)
let arravg = (a) => {
	let total = 0;
	for (const value of a) {
		total += value;
	}
	return Math.round(total/a.length);
};
// get page `pagenum` of this index as a matrix of grays.
// pagenum: number substituted into the "page<N>" part of the scan url.
// colorsdone: callback, called with [matrix, width, height] once the scan is decoded (matrix[row][col] is the rounded mean of r, g and b),
//             or with null if the image could not be loaded (e.g. there is no such page), so that chained preloads are not stalled.
let getcolors = (pagenum, colorsdone) => {
	// get the url of the image: same as the scan link of the current page, but for the wanted page and at a fixed small width
	let src = $("#ca-proofreadPageScanLink").children()[0].href
		.replace(/page(\d+)/, () => "page"+pagenum)
		.replace(/\d+px/, "150px"); // EVA
	// build the image element directly instead of looking it up by id, so that a leftover or concurrent image can never be picked up by mistake
	let img = document.createElement("img");
	img.crossOrigin = "anonymous"; // needed to be allowed to read the pixels back from the canvas
	img.style.display = "none"; // hide image, it works
	img.onload = () => {
		let width = img.naturalWidth;
		let height = img.naturalHeight;
		// the canvas we strap the image on. it's only used to read pixels, so it never needs to be in the page
		let canv = document.createElement("canvas");
		canv.width = width;
		canv.height = height;
		let ctx = canv.getContext("2d");
		ctx.drawImage(img, 0, 0);
		// the optional fifth argument is a settings object, not a bare string (a string is rejected with a TypeError where the argument is supported)
		let data = ctx.getImageData(0, 0, width, height, {colorSpace: "srgb"}).data; // here an array of [r, g, b, a, r, g, b, a, ...]
		img.remove();
		let a = []; // matrix of the colors
		for (let y=0;y<height;y++) {
			let row = [];
			for (let x=0;x<width;x++) {
				let p = (y*width+x)*4; // 4 values per pixel; the alpha is ignored. Maybe shouldn't? To be looked into
				row.push(Math.round((data[p]+data[p+1]+data[p+2])/3)); // grayscale it
			}
			a.push(row);
		}
		colorsdone([a, width, height]);
	};
	img.onerror = () => {
		// don't leave the hidden image behind, and tell the caller there is nothing for this page
		img.remove();
		colorsdone(null);
	};
	img.src = src;
	document.body.appendChild(img);
};
// when the edit form is ready, start preloading everything poemiseplus will want
mw.hook("wikipage.editform").add(() => {
	// the page number is what follows the first "/" of the title
	const thispagenum = Number(mw.config.get("wgTitle").split("/")[1]);
	// cache layout: [0] colors of the previous page, [1] of this page, [2] of the next page, [3] whether the previous page's wikitext contains "|end="
	window.poemisecache = [null, null, null, null];
	// the three scans are fetched one after the other (this page first, then previous, then next), each started from the callback of the one before
	const loadnext = () => {
		getcolors(thispagenum+1, (colors) => {
			window.poemisecache[2] = colors;
		});
	};
	const loadprevious = () => {
		getcolors(thispagenum-1, (colors) => {
			window.poemisecache[0] = colors;
			loadnext();
		});
	};
	getcolors(thispagenum, (colors) => {
		window.poemisecache[1] = colors;
		loadprevious();
	});
	// meanwhile, ask the API for the wikitext of the previous page
	const api = new mw.Api();
	const previoustitle = mw.config.get("wgPageName").split("/")[0]+"/"+(thispagenum-1).toString();
	api.get({
		action:"parse",
		prop:"wikitext",
		page:previoustitle,
	}).done((reply) => {
		// [3] stays null if the query fails
		window.poemisecache[3] = reply.parse.wikitext["*"].includes("|end=");
	});
});
window.poemiseplus = ()=>{
// the scan of this page is preloaded into window.poemisecache[1] when the edit form opens. it is still null while the image is loading (or if it failed), so say so clearly instead of dying on an obscure destructuring error
if (!window.poemisecache || !window.poemisecache[1]) {
	throw new Error("poemise+: the scan of this page is not loaded (yet); wait a moment and try again.");
}
// a: matrix of grays (a[row][column]), with its width and height in pixels
let [a,  width, height] = window.poemisecache[1];
// when I say "light" I mean text. It's just that everything is always inverted for me. "dark" is background
// how many good pixels we want on a line. EVA
let enough = 20;
// how close we want stuff to be to the edges to say it's no good. EVA
let edge = 5; // used to be 10, but meh.
// number of times we erode. EVA
let times = 6;
// proportion past which a break is considered meaningful. EVA
let alot = 0.22; // ie 22% of height
// estimation of number of ems per line. EVA
let lheight = 1.7;
// ratio to the mean past which we think a line is in fact two erroneously merged lines
let bigline = 1.6;
// a sort of average value: the mean of the row means, lowered by 10%. taking a slightly lower one works better, because there are much more background pixels than text pixels; this moves the threshold outside of the huge background spike
let m = arravg(a.map(arravg))*0.9; // EVA
// classifying each pixel by comparing to m: true means text (value at or below the threshold), false means background
let t = a.map((row) => row.map((gray) => gray <= m));
// Get Number In Square: gnis(i, j, h, w) counts the trues of t inside the rectangle whose top left corner is (row i, column j), h rows high and w columns wide.
// w defaults to h, so gnis(i, j, n) is a square of side n. The rectangle is clipped to the bottom and right edges of the image.
let gnis = (i, j, h, w=h) => {
	const rowend = Math.min(i+h, height); // exclusive
	const colend = Math.min(j+w, width); // exclusive
	let count = 0;
	for (let row=i;row<rowend;row++) {
		for (let col=j;col<colend;col++) {
			if (t[row][col]) {
				count++;
			}
		}
	}
	return count;
};
// and now we can remove dots. The idea: slide a 9x9 window over the image; when everything that is true in the window sits in its central 3x3 (and there is something there), it is an isolated speck, so clear it
for (let i=0;i<height-9;i++) {
	for (let j=0;j<width-9;j++) {
		const center = gnis(i+3, j+3, 3); // trues in the central 3x3
		if (center == 0 || gnis(i, j, 9) != center) {
			continue; // nothing in the center, or something else in the surrounding ring
		}
		for (let row=i+3;row<i+6;row++) {
			t[row].fill(false, j+3, j+6); // remove the dot from true
		}
	}
}
// "erode" an array of arrays horizontally: returns a new matrix in which every cell that is true, or whose left or right neighbour in the same row is true, is true.
// (the trues grow by one cell on each side, so in image-processing terms this is really a dilation.) the input is not modified
let erode = (a, width) => a.map((row) => row.map((cell, j) => {
	const left = j > 0 && row[j-1];
	const right = j < width-1 && row[j+1];
	return cell || left || right;
}));
// remove, in place, any run of true that touches the left or right edge of a row.
// on each side the sweep always clears the `edge` outermost columns, then keeps clearing for as long as the cells are true. `edge` comes from the enclosing scope.
// the right-hand margin used to be one column narrower than the left-hand one (width-j instead of width-1-j); both sides now clear exactly `edge` columns
let unnoise = (a, width) => {
	a.forEach((row) => {
		let j = 0;
		while (j < width && (row[j] || j < edge)) { // left
			row[j] = false;
			j++;
		}
		j = width-1;
		while (j > -1 && (row[j] || width-1-j < edge)) { // right
			row[j] = false;
			j--;
		}
	});
};
// make the trues larger, to link stuff that's nearly but not exactly at the edges
t = erode(t, width);
// remove noise from scanning and paper
unnoise(t, width);
// h starts out as t. no copy is needed: erode and map both build new arrays, so t is never touched through h
let h = t;
// erode, to make scattered points into a more substantial mass
for (let k=0;k<times;k++) {
	h = erode(h, width);
}
// check for each line, whether there are more than enough good pixels in the line. that allows us to define a "line of pixels containing text". WARNING: here we change dimensions, we crush all lines to a single value
h = h.map((row) => row.filter(Boolean).length >= enough);
// crop noise at top and bottom: always clear the `edge` outermost rows, then keep clearing while the rows are true
let i = 0; while (i < height && (h[i] || i < edge)) {h[i] = false; i++}
// height-1-i is the distance to the last row, so that the bottom margin is exactly as thick as the top one (height-i made it one row thinner)
i = height-1; while (i > -1 && (h[i] || height-1-i < edge)) {h[i] = false; i--}
// now group the rows of pixels into lines of text, i.e. be able to say "the nth line goes from row x to row y", x inclusive and y exclusive
let l = [];
let text = true; // whether the row before was inside a line. the scan starts on the first text row, so we begin inside one
let j = h.findIndex(x => x); // where the current line started
for (let i = j;i<height;i++) {
	if (h[i]) {
		if (!text) { // this is the beginning of a line
			text = true;
			j = i;
		}
	} else if (text) { // end of a line
		text = false;
		l.push([j, i]); // add it to the list
	}
}
// eliminate stuff one row high, probably error or rules. EVA. thanks to width 150 and not 100 normally this never catches text lines. as the interval is [inclusive; exclusive[, end-start is always >= 1 for a real line, so > 1 only drops the single-row ones
l = l.filter(([start, end]) => end-start > 1);
// if the header box is not empty, the first detected line is the header: drop it
if ($("#wpHeaderTextbox").val().trim() != "") { l = l.slice(1) }
// sometimes the ascenders and descenders come so close that two lines get counted as one. that's very bad. so compare the height of each line to the average height:
let mhei = arravg(l.map(([start, end]) => end-start));
// everything at least bigline times as high as the average is taken to be several merged lines, and cut into equal parts
l = l.flatMap((x) => {
	const hei = x[1]-x[0];
	if (hei >= mhei*bigline) {
		const howmany = Math.round(hei/mhei); // how many lines are merged here exactly?
		const subhei = Math.floor(hei/howmany); // probable height of each of the merged lines
		// the kth part; its end is pulled back by one row, which leaves a one-row break between consecutive parts
		return Array.from({length: howmany}, (_, k) => [x[0]+subhei*k, x[0]+subhei*(k+1)-1]);
	} else {
		return [x];
	}
});
// this is the number of lines, not counting unwrapping
let nlines = l.length;
// now, time to unwrap using the text, else it'll mess up with the indenting. at this stage, we assume that the user has done the OCR corrections, and very importantly has moved the header
let s = $("#wpTextbox1").val().trim().split("\n").filter(x => x != "");
let neot = s.includes("$"); // No End Or Title if there's a $ on its own line
if (neot) {
	s.splice(s.indexOf("$"), 1); // remove the $
}
i = 0; // index. doesn't always increment, when we collapse stuff
while (i < s.length) {
	// a line is a wrapped continuation of the one before if it starts by lowercase or ^, or has two or fewer characters
	const wrapped = i > 0 && (/[a-z\^]/.test(s[i][0]) || s[i].length < 3);
	if (!wrapped) {
		i++;
		continue;
	}
	// collapse i into i-1 in the detected lines too. there may be fewer detected lines than lines of text (unclean image); in that case there is nothing to merge, and the mismatch is reported later instead of crashing here
	if (l[i-1] && l[i]) {
		l[i-1] = [l[i-1][0], l[i][1]];
		l.splice(i, 1);
	}
	const prev = s[i-1];
	// the ^ is only an indication by the user that this should unwrap: always remove it (it used to be left in the text when the previous line ended with a dash)
	const cur = (s[i][0] == "^")?s[i].slice(1):s[i];
	if (prev.endsWith("-") && prev[prev.length-2] != "-") {
		s[i-1] = prev.slice(0, prev.length-1)+cur; // a word was broken, collapse the dash
	} else if (prev.endsWith("—")) {
		s[i-1] = prev+cur; // it was an emdash, in which case leave it but don't add a space
	} else {
		s[i-1] = prev+" "+cur; // standard case, add a space
	}
	s.splice(i, 1); // remove i
}
// everything from here on needs at least one line of text. without this check an empty l ended in an obscure "Invalid array length" (or later in a TypeError on l[0])
if (l.length == 0) {
	throw new Error("poemise+: no line of text was detected on the scan of this page.");
}
// now we measure the breaks between lines: br[k] is the number of rows between the end of line k and the start of line k+1
let br = l.slice(1).map((line, k) => line[0]-l[k][1]);
// and we get the average break, which will be the separator to determine later what's a stanza break and what's not. Why +1? EVA. (with a single line br is empty and mbr is NaN, so nothing counts as a stanza break)
let mbr = arravg(br)+1;
// add the line collapsing to h (this is only useful for debugging, but debugging is ''very'' useful)
h = h.map(() => false); // put all to false
l.forEach(([start, end]) => {
	for (let row=start;row<end;row++) { // put everything inside of a line as text
		h[row] = true;
	}
});
// with that, all line-based stuff is done. only left to do column-based stuff. So we crush t column-wise: each row is replaced by the index of its first text pixel, or by width if it has none
let v = t.map((row) => {
	const first = row.indexOf(true);
	return first == -1 ? width : first;
});
// this function gets the minimum indent from the beginning of text line s to the end of text line e (both [start, end[ pairs of rows; e defaults to s), in pixels from left edge. never more than width
let indent = (s, e=s) => Math.min(width, ...v.slice(s[0], e[1]));
// next we get the pattern: the indent of each line in order, with a "." wherever the break before a line is bigger than mbr, i.e. a stanza break
let pattern = [];
l.forEach((line, k) => {
	if (k > 0 && br[k-1] > mbr) { // if there's a stanza break before
		pattern.push(".");
	}
	pattern.push(indent(line)); // here it's in pixels, not ems. that's done later
});
// we'll need this. it tells, in a slightly sloppy & repetitive way, if a page has big top/bottom margins.
// takes the [matrix of grays, width, height] of a page (as produced by getcolors) and returns [big margin at top?, big margin at bottom?], "big" meaning at least alot of the height.
// a page on which no text row at all is found counts as having both (it used to give [false, true], because of the -1 of findIndex/findLastIndex).
// relies on arravg, erode, unnoise, times, enough, edge and alot from the enclosing scopes
let minmaxe = ([a, width, height]) => {
	// same steps as for the current page (yeah, IK, ought to factorise), minus the dot removal: threshold, widen, drop what touches the sides, widen more, crush each row to "has enough text pixels"
	let m = arravg(a.map(arravg))*0.9;
	let t = a.map((row) => row.map((gray) => gray <= m));
	t = erode(t, width);
	unnoise(t, width);
	let h = t;
	for (let k=0;k<times;k++) {
		h = erode(h, width);
	}
	h = h.map((row) => row.filter(Boolean).length >= enough);
	// crop noise at top and bottom
	let i = 0; while (i < height && (h[i] || i < edge)) {h[i] = false; i++}
	i = height-1; while (i > -1 && (h[i] || height-1-i < edge)) {h[i] = false; i--}
	let first = h.indexOf(true);
	if (first == -1) { // no text at all: the whole page is margin
		return [true, true];
	}
	let last = h.lastIndexOf(true);
	// first is the number of empty rows above the text, height-1-last the number of empty rows below it
	return [first/height >= alot, (height-1-last)/height >= alot];
};
// is this a title page? never if the user said so with $ (neot), and never if the preceding page is known to contain a |end=
let ist = Boolean(!neot &&
	!window.poemisecache[3] && // either it's null, and the query failed, or it's false, and the preceding page doesn't contain a |end=. If it's true, this clause is false and we exit
	(/^\d+,\./.test(pattern.join(",")) // a single-line stanza at top
	|| l[0][0]/height >= alot // large break at top
	|| (window.poemisecache[0] && minmaxe(window.poemisecache[0])[1]))); // large break at bottom of preceding
// the indent baseline (where we don't put any :s): the smallest indent of the body, i.e. skipping the title line if there is one. on a page with nothing but the title there is no second line, so fall back on the only line there is
let ibl = indent(l[Math.min(ist?1:0, l.length-1)], l[l.length-1]);
// if it's a title, remove it from the pattern
if (ist) {
	let firstbreak = pattern.indexOf(".");
	// normally everything up to and including the first stanza break goes. when there is no stanza break at all, remove just the title line: the title uses up the first line of text without using any pattern entry, so keeping it would shift every indent by one line
	pattern = pattern.slice(firstbreak == -1 ? 1 : firstbreak+1);
}
// number of stanza breaks
let nbreaks = pattern.filter(x => x == ".").length;
// number of pixels per ems, calculated using the height of the nlines and nbreaks, their numbers, and lheight
let ppe = (l[l.length-1][1]-l[0][0])/((nlines+nbreaks)*lheight);
// which allows us to transform the aforementioned px indents from left edge into em indents from ibl. they are kept as numbers: as strings, the minimum below was found by alphabetical comparison ("10" < "3", "-1" < "-2"), which gave a wrong minimum and then negative indents that made ":".repeat() throw
pattern = pattern.map(x => (x == ".")?".":Math.round((x-ibl)/ppe));
// then we make a double-check: subtract from all numbers the minimum number. this is to avoid a situation like [[Page:Poems Taylor.djvu/70]], where a dot on the left indents everything multiple levels. note: this doesn't catch the occurrences where the dot is interpreted as a line
let minim = pattern.reduce((lowest, x) => (x == ".")?lowest:Math.min(lowest, x), Infinity); // stays Infinity only if there is no line at all, in which case it isn't used
pattern = pattern.map(x => (x == ".")?".":(x-minim));
// and now we can get (thanks to the unwrapped s) the body of our ppoem: walk through the pattern and the lines of text together
let poem = "";
j = 0; // index in s
for (let k=0;k<pattern.length;) { // k: index in pattern, moved forward by hand
	if (s[j] == undefined) { // we have lines in pattern where we don't have text. probably the footers, ignore and crop so that it doesn't mess up rhyme later
		pattern = pattern.slice(0, k);
		break;
	}
	if (pattern[k] == ".") { // stanza break: uses a pattern entry but no text
		k++;
	} else {
		if (j == 0 && ist) {
			// this is the title: it uses a line of text but nothing of the pattern, so k stays where it is
		} else {
			poem += ":".repeat(pattern[k])+s[j];
			k++;
		}
		j++;
	}
	poem += "\n"; // added in every case; a stanza break thus gives an empty line, and the extra ones at the ends are trimmed below
}
poem = poem.trim(); // remove whitespace
if (s[j] != undefined) {
	alert("Some text was not matched. Probably unclean image. Not doing anything.");
	//return; // there's text we skipped. that's bad. NOTE(review): with the return commented out, the result is still written to the textbox, whatever the message says
}
// that's most of the hard work done. the next part, which is also a bit tricky, is determining what exactly we put around our body.
let before, after;
// we'll need to try and find if there's a repeating pattern at the start. entries are joined with commas (plus a trailing one) because indents can have multiple digits
const joinedPattern = pattern.join(",")+",";
const motifPlain = joinedPattern.match(/^(.+)\1+/); // repeating motif
const motifWithBreak = (joinedPattern+".,").match(/^(.+)\1+/); // the motif we'd get if we speculated a stanza break at the end
// take the plain motif when it exists and is at least as long as the speculative one; else the speculative one if it exists; else nothing
let rhyme = null;
if (motifPlain && (!motifWithBreak || motifPlain[1].length >= motifWithBreak[1].length)) {
	rhyme = motifPlain[1];
} else if (motifWithBreak) {
	rhyme = motifWithBreak[1];
}
if (rhyme) { // back to a list, removing empty elements. can happen bc of the comma story
	rhyme = rhyme.split(",").filter(x => x != "");
}
// What should go before?
if (ist) { // title?
	before = "{{tpp|"+s[0]+"|\n"; // then we use tpp, with the first line of text as title
} else {
	before = "{{ppoem|\nstart="+
		((rhyme && rhyme[rhyme.length-1]==".")?"stanza":"follow") // if the character that would have come right before is a ., put stanza, else follow
		+"|\n";
}
// And after?
// decide whether this is a poem's end, in which case no |end= is put. a $ (neot) means the user told us there is no end on this page, and that has to win over both measurements: without the parentheses, && bound tighter than || and a big top margin on the next page ignored the $
let noend = !neot && (
	(height-l[l.length-1][1])/height >= alot || // break at bottom of this
	Boolean(window.poemisecache[2] && minmaxe(window.poemisecache[2])[0])); // break at top of next
if (noend) {
	after = "\n}}";
} else {
	after = "\n|end="+
		((rhyme && rhyme[pattern.length % rhyme.length]==".")?"stanza":"follow") // If the next character in rhyme is a ., put stanza, else follow
		+"\n}}";
}
let res = before+poem+after; // result
// NOTE(review): clean is not defined in this script; presumably a global provided by another script (the original poemise.js?) - confirm
$("#wpTextbox1").val(clean(res)).trigger("input");
// for debugging: show a on a canvas, with the pixels of each text line from its indent onwards (and the baseline column) tinted blue, plus a chart of the number of pixels of each gray value
let sep = Math.round(m); // the threshold, marked on the chart
// where we'll put the page (reused if it's still there from a previous run)
if ($("#can").length == 0) {
	$("body").append("<canvas id='can' style='z-index:10;position:absolute;top:0'></canvas>");
}
let can = $("#can")[0];
can.width = width;
can.height = height;
// the chart
if ($("#cha").length == 0) {
	$("body").append("<canvas id='cha' style='z-index:10;position:absolute;top:0;right:0;border:1px solid black'></canvas>");
}
let cha = $("#cha")[0];
cha.width = 256;
cha.height = 200;
let ctx = can.getContext("2d");
// actually draw on the canvas
for (let i=0;i<height;i++) {
	const rowindent = indent([i, i+1]); // the same for the whole row, so computed once per row and not once per pixel
	for (let j=0;j<width;j++) {
		const gray = a[i][j];
		const red = ((h[i] && j >= rowindent) || j == ibl)?0:gray; // remove all the red in the pixel, so that it becomes blue
		ctx.fillStyle = "rgba("+red+","+gray+","+gray+",255)";
		ctx.fillRect(j, i, 1, 1);
	}
}
ctx = cha.getContext("2d");
ctx.fillStyle = "rgba(255,255,255,255)";
ctx.fillRect(0,0,256,200); // just filling the background
// gives for each value in [0, 255] the number of pixels of that value. one pass over the image, instead of flattening and filtering the whole image 256 times
let counts = new Array(256).fill(0);
for (const row of a) {
	for (const gray of row) {
		if (counts[gray] !== undefined) {
			counts[gray]++;
		}
	}
}
let ma = Math.max(...counts);
counts = counts.map(x => 200*x/ma); // we scale it down to 200px
for (let i=0;i<256;i++) {
	ctx.fillStyle = "rgba("+(i==sep?"255,0,0":"0,0,255")+",255)"; // if we're doing sep then red else blue
	ctx.fillRect(i, i==sep?0:(200-counts[i]), 1, 200); // if we're doing sep then full height else counts[i]
}
// erase it after 10 seconds. the canvases are reused from one run to the next, so cancel the timer of a previous run first, else it would erase this run's drawing too early
clearTimeout(window.poemisedebugtimer);
window.poemisedebugtimer = setTimeout(() => {can.remove();cha.remove()}, 10000);
};
})();
// </nowiki>