User:Inductiveload/Scripts/Pagewise DJVU OCR extractor
Appearance
This script creates a text file for each page of a DJVU.
Parameters:
- -i is the input DJVU file
- -o is the output directory
- -p is the number of pages (the script will do the first "p" pages)
- -d is an optional debug flag
Example
pagewise-ocr.py -i "C:\Documents and Settings\Me\My Documents\input.djvu" -p 100 -o "C:\Documents and Settings\Me\My Documents\inputOCR"
Source code
#!/usr/bin/python
import os
import optparse
import subprocess
def main():
    """Parse the command line and run the pagewise OCR extraction.

    Options:
        -i  input DJVU file (required)
        -p  number of pages to process, counted from page 1 (required)
        -o  output directory for the per-page text files (required)
        -d  optional debug flag

    If a required option is missing, the problem and the option help are
    printed and the process exits with status -1.
    """
    parser = optparse.OptionParser(
        usage='Usage: %prog -i <input DJVU> -p <pages> -o <output directory> [-d]')
    parser.add_option('-i', dest='input', action='store',
                      help='input DJVU (required)')
    # type='int' makes optparse reject a non-numeric page count up front.
    parser.add_option('-p', dest='pages', action='store', type='int',
                      help='number of pages (required)')
    parser.add_option('-o', dest='output', action='store',
                      help='output directory (required)')
    parser.add_option('-d', dest='debug', action='store_true', default=False,
                      help='debug flag')

    (opts, args) = parser.parse_args()

    # Check mandatory options; each one is paired with its own message so a
    # missing '-i' is reported as such (it used to go undetected).
    required = (
        (opts.input, "The input file '-i' must be given\n"),
        (opts.pages, "The number of pages (-p) must be given\n"),
        (opts.output, "The output directory '-o' must be given\n"),
    )
    for value, message in required:
        if value is None:
            print(message)
            parser.print_help()
            raise SystemExit(-1)

    PagewiseOCR(opts)
class PagewiseOCR():
    """Extract the text layer of a DJVU file into one text file per page.

    Constructing an instance does all the work: for every page from 1 to
    ``opts.pages`` it runs ``djvutxt`` on ``opts.input`` and writes the
    result to ``OCRoutput_NNNN.txt`` inside ``opts.output``.
    """

    def __init__(self, opts):
        """Run the extraction.

        ``opts`` must provide ``input`` (DJVU path), ``pages`` (page count,
        int or numeric string), ``output`` (directory) and ``debug`` (bool).
        The attribute ``djvuDir`` is set on ``opts`` as a side effect.
        """
        self.opts = opts

        # Directory of the DjVuLibre executables  <-- CHANGE ME
        self.opts.djvuDir = r"c:\program files\djvuzone\djvulibre"

        djvutxt = os.path.join(self.opts.djvuDir, 'djvutxt')

        # Create the output directory if needed, so opening the per-page
        # files below does not fail on a fresh destination.
        if not os.path.isdir(self.opts.output):
            os.makedirs(self.opts.output)

        for page in range(1, int(self.opts.pages) + 1):
            filename = os.path.join(self.opts.output,
                                    'OCRoutput_%04d' % page + '.txt')

            if self.opts.debug:
                print('\tProcessing page %d' % page)

            # Create (or truncate) the output file before djvutxt runs.
            with open(filename, 'w'):
                pass

            cmd = [djvutxt, '-page=' + str(page), self.opts.input, filename]
            status = subprocess.call(cmd)
            if status != 0:
                # Best effort: report the failure and carry on with the
                # remaining pages instead of aborting the whole run.
                print('Warning: djvutxt exited with status %d for page %d'
                      % (status, page))
# Script entry point: run the extractor only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()