User:Inductiveload/Scripts/DJVU OCR
Jump to navigation
Jump to search
This is a script to perform Tesseract-based OCR on a DJVU file. It requires that you have Tesseract and DjvuLibre, and if you want to convert to bitonal images, you also need ImageMagick.
Inspired by this Perl script. This script is designed for Linux, but it can be modified to run on Windows by changing the file paths as needed.
- To OCR a DJVU file, and update (-u) the text layer with new text. Without -u, the DJVU will be unaltered.
python -i ~/INFILE.djvu -u
- Enable debugging mode (reports progress), use -d
python -i ~/INFILE.djvu -d
- Enable Tesseract output, use -t
python -i ~/INFILE.djvu -t
- Use textcleaner and ImageMagick to clean the page image and convert it to bitonal (the threshold is fixed at 50%), use -c
python -i ~/INFILE.djvu -c
- Output a human-readable transcript of the DJVU file to a given location
python -i ~/INFILE.djvu -o ~/TRANSCRIPT.txt
- Use a Tesseract language other than English
python -i ~/INFILE.djvu -l LANG
The code
#!/usr/bin/env python
import os
import glob
import subprocess
import re
import optparse
def main(argv=None):
    """Parse the command line and run the OCR job.

    argv -- list of command-line arguments without the program name;
            when None, optparse falls back to sys.argv[1:].

    Exits with status 2 (via parser.error) when the mandatory '-i'
    option is missing.
    """
    parser = optparse.OptionParser(
        usage='Usage: %prog -i <source djvu file> [options] [-o <output file>]')

    parser.add_option('-i', dest='djvu', action='store',
                      help='the source djvu file to perform OCR on')
    parser.add_option('-l', dest='lang', action='store', default='eng',
                      help="OCR language (default: 'eng')")
    parser.add_option('-d', dest='debug', action='store_true', default=False,
                      help='enable debugging information')
    parser.add_option('-t', dest='tess_out', action='store_true', default=False,
                      help='enable tesseract output')
    parser.add_option('-c', dest='clean', action='store_true', default=False,
                      help='use textcleaner and imagemagick to convert the image to bitonal black and white')
    parser.add_option('-o', dest='output', action='store',
                      help='output a human readable text file to a given file path')
    parser.add_option('-u', dest='update', action='store_true', default=False,
                      help='update the djvu file text layer')

    (opts, args) = parser.parse_args(argv)

    # check mandatory options; parser.error prints the usage text and exits,
    # so we never carry on without an input file
    if opts.djvu is None:
        parser.error("The input file '-i' must be given")

    # hand the parsed options to the worker class and run the whole job
    DjvuTesseract(opts).process_djvu()
class DjvuTesseract(object):
    """Run Tesseract OCR over every page of a DjVu file.

    Each page is extracted to a temporary TIFF with ddjvu, optionally
    cleaned up, OCR'd with tesseract, appended to a human-readable
    transcript and, if requested, written back into the DjVu text layer
    with djvused.

    The `opts` object must provide the attributes: djvu, lang, debug,
    tess_out, clean, output and update.
    """

    def __init__(self, opts):
        """Store the options and open the human-readable transcript file."""
        self.opts = opts

        self.temp_img = "/tmp/TESSERACT-OCR-TEMP.tif"
        self.temp_ocr = "/tmp/TESSERACT-OCR-TEMP"  # tesseract adds .txt
        self.ocr_text = self.temp_ocr + '.txt'

        # file to dump page-wise formatted OCR'd text into
        self.djvu_text = "/tmp/TESSERACT-OCR-TEMP.djvu.txt"

        # file to dump human readable output into for the whole file
        if self.opts.output:
            output_filename = self.opts.output
        else:  # dump in /tmp/
            output_filename = "/tmp/TESSERACT-OCR-TEMP.output.txt"

        self.out_text = open(output_filename, 'w')

    def _log(self, message, newline=True):
        """Write a progress message to stdout if debugging is on.

        With newline=False the message is followed by a single space so
        that a later "complete" lands on the same line.
        """
        if not self.opts.debug:
            return

        import sys  # local import: the file header does not import sys

        sys.stdout.write(message + ("\n" if newline else " "))
        # flush so the message is visible before a long-running command
        sys.stdout.flush()

    def command(self, command, out=False, err=False):
        """Run a command with subprocess.Popen and return (stdout, stderr).

        Required for python 2.6 since subprocess.check_output doesn't exist.

        command -- argument list passed to subprocess.Popen
        out     -- when True, stdout is captured and returned; otherwise
                   it is inherited from this process and None is returned
        err     -- when False (default), stderr is captured so the tool
                   (e.g. tesseract) does not spam the terminal; when True
                   it is inherited from this process and None is returned
        """
        if out:
            std_out = subprocess.PIPE
        else:
            std_out = None

        if not err:
            std_err = subprocess.PIPE
        else:
            std_err = None

        proc = subprocess.Popen(command, stdout=std_out, stderr=std_err)
        out, err = proc.communicate()

        return out, err

    def complete(self):
        """Prints a "complete" message if debugging is on"""
        self._log("complete")

    def close(self):
        """Close the human-readable transcript file."""
        self.out_text.close()

    def calculate_djvu_length(self):
        """Ask djvused for the page count and store it in self.num_pages.

        Raises ValueError if djvused produced no usable page count.
        """
        cmd = ['djvused', self.opts.djvu, '-e', 'n']
        out, err = self.command(cmd, out=True)

        try:
            self.num_pages = int(out)
        except (TypeError, ValueError):
            raise ValueError("djvused did not report a page count for %r"
                             % self.opts.djvu)

        self._log("(INF) number of pages: %d" % self.num_pages)

    def format_ocr_text(self, page):
        """Format a page's OCR'd text into a DJVU friendly form

        Reads the text file tesseract produced (self.ocr_text), appends
        it to the human-readable transcript (self.out_text) under a page
        header, and writes the djvused text structure to self.djvu_text.
        """
        self.out_text.write('\n## Page %d ###\n\n' % page)

        chunks = ["(page 0 0 1 1\n"]

        # read out of the text file that tesseract made
        with open(self.ocr_text, 'r') as ocr_file:
            for line in ocr_file:
                # write to the human readable file
                self.out_text.write(line)

                # add each line of text, escaping \ to \\ and " to \" as we go
                escaped = line.replace('\\', '\\\\').replace('"', '\\"').strip()
                chunks.append('(line 0 0 1 1 "%s")\n' % escaped)

        chunks.append(")\n")

        # write the djvused-formatted page text
        with open(self.djvu_text, 'w') as djvu_file:
            djvu_file.write("".join(chunks))

    def process_pages(self):
        """OCR every page; requires self.num_pages to be set beforehand."""
        for page in range(1, self.num_pages + 1):  # djvu pages are 1-indexed

            self._log("\n\t(INF) Processing page %d" % page)

            # Extract page as an image
            self._log("\t(INF) Extracting DjVu page to image . . .", newline=False)
            cmd = ['ddjvu', '-format=tiff', '-page=%d' % page, self.opts.djvu, self.temp_img]
            self.command(cmd)
            self.complete()

            # Cleanup image
            if self.opts.clean:
                # apply text cleaner (expects ./textcleaner in the working directory)
                self._log("\t(INF) Applying textcleaner . . .", newline=False)
                cmd = ['./textcleaner', self.temp_img, self.temp_img]
                self.command(cmd)
                self.complete()

                # convert to bitonal black and white with ImageMagick
                self._log("\t(INF) Applying bitonal conversion . . .", newline=False)
                cmd = ['convert', self.temp_img, '-threshold', '50%', self.temp_img]
                self.command(cmd)
                self.complete()

            # Perform OCR on the image
            self._log("\t(INF) Beginning OCR. . .", newline=False)
            cmd = ['tesseract', self.temp_img, self.temp_ocr, '-l', self.opts.lang]
            self.command(cmd, err=self.opts.tess_out)
            self.complete()

            # convert the OCR'd text to a DJVU friendly format and a human-friendly format
            self.format_ocr_text(page)

            # update the DJVU text layer
            if self.opts.update:
                self._log("\t(INF) Updating DJVU page . . .", newline=False)

                # replace the text in the DJVU file
                cmd = ['djvused', self.opts.djvu, '-e', 'select %d; remove-txt' % page, "-s"]
                self.command(cmd)

                cmd = ['djvused', self.opts.djvu, '-e', 'select %d; set-txt %s' % (page, self.djvu_text), "-s"]
                self.command(cmd)
                self.complete()

    def process_djvu(self):
        """Run the whole job: count the pages, then OCR each of them.

        The transcript file is closed when processing ends, whether it
        succeeded or raised.
        """
        self._log("(INF) Processing %s" % self.opts.djvu)

        try:
            # calculate DJVU length
            self.calculate_djvu_length()

            self.process_pages()
        finally:
            self.close()
if __name__ == "__main__":
    # note: djvused text structure which works (one "line" entry per OCR'd line):
    #   (page 0 0 1 1
    #     (line 0 0 1 1 "toto")
    #     (line 0 0 1 1 "toto la la")
    #   )
    main()