#!/usr/bin/python # -*- coding: utf-8 -*- # # bwscan2pdf V 0.4 # Copyright (c) by Thomas Lauckner, 2009 # # This work is licensed cc-by-sa # See http://creativecommons.org/licenses/by-sa/3.0/ # # Changelog: # # V 0.4 The script uses temporary files now. # V 0.3 Changed gimp script from threshold to photocopy for better output # V 0.2 tesseract option included # V 0.1 Initial release # # TODO: # * Progress bar "#" -> "#######" # * Confirm the input files are images (in the right format???) # * Be flexible: Use any image format "convert" accepts # * Optimize for speed import sys import os import getopt import tempfile # We have to use a temporary directory # since the output of tesseract cannot be controlled. # => Put all output in this temporary directory and everything is fine :) TEMPDIR=tempfile.mkdtemp(prefix="bwscan") def usage(): print "Usage: %s -s -d -o -e -n -r -t files" % sys.argv[0] print "Usage: %s --single --double --odd --even --normal --reverse --tesseract files" % sys.argv[0] def defaults(): single="TRUE" tesseract="TRUE" return (single,tesseract) def not_implemented(): print "This option is not implemented yet." def merge(file1,file2): file(file2,'a').write("\n"+file(file1,'r').read()) try: opts, args = getopt.getopt(sys.argv[1:], "sdoernt", ["single", "double", "odd", "even", "reverse", "normal", "tesseract"]) except getopt.GetoptError: usage() sys.exit(2) def tempFilename(suffixstr=""): (openedfd,name)=tempfile.mkstemp(suffix=suffixstr,prefix="bwscan",dir=TEMPDIR) os.close(openedfd) return name single="NONE" even="NONE" reverse="NONE" tesseract="NONE" for opt,arg in opts: if opt in ("-s", "--single"): if single=="NONE": single="TRUE" else: usage() sys.exit(2) elif opt in ("-d", "--double"): if single=="NONE": single="FALSE" else: usage() sys.exit(2) elif opt in ("-o", "--odd"): if even=="NONE": even="FALSE" else: usage() sys.exit(2) elif opt in ("-e", "--even"): if even=="NONE": even="TRUE" else: usage() sys.exit(2) elif opt in ("-r", "--reverse"): if reverse=="NONE": reverse="TRUE" else: usage() sys.exit(2) elif opt in ("-n", "--normal"): if reverse=="NONE": reverse="FALSE" else: usage() sys.exit(2) if opt in ("-t", "--tesseract"): tesseract="TRUE" # Fall back to default if single=="NONE": print "Option 'single/double' is not set. Falling back to default." (single,tesseract)=defaults() if single=="TRUE": reverse="FALSE" if reverse=="NONE": usage() sys.exit(2) elif reverse=="TRUE" and even=="NONE": usage() sys.exit(2) filenames_in=args filenames_sorted=[] filenames_temp_bwpng={} filenames_temp_pdf={} filenames_temp_tif={} temp_final_textfile=tempFilename(".txt") temp_final_pdf=tempFilename(".pdf") if single=="TRUE": filenames_sorted=filenames_in elif reverse=="FALSE": # Sort files, assuming that the odd pages were scanned first odd_pages,rem=divmod(len(filenames_in),2) odd_pages+=rem for oddindex in range(odd_pages): filenames_sorted.append(filenames_in[oddindex]) if (oddindex+odd_pages < len(filenames_in)): filenames_sorted.append(filenames_in[oddindex+odd_pages]) elif reverse=="TRUE": odd_pages,rem=divmod(len(filenames_in),2) if rem==1: print "The number of files needs to be be even." sys.exit(2) for oddindex in range(odd_pages): filenames_sorted.append(filenames_in[oddindex]) if even=="FALSE" and oddindex==odd_pages-1: continue else: filenames_sorted.append(filenames_in[len(filenames_in)-oddindex-1]) # Convert images to b/w pdf using gimp and convert # via gimp-threshold #gimpscript=''' #gimp -i -d -f -b '(define (threshold infile outfile) (let* ((image (car (gimp-file-load RUN-NONINTERACTIVE infile infile))) (drawable (car (gimp-image-get-active-layer image)))) (gimp-threshold drawable 200 255) (set! drawable (car (gimp-image-get-active-layer image))) (gimp-file-save RUN-NONINTERACTIVE image drawable outfile outfile) (gimp-image-delete image))) (threshold "%s" "%s") (gimp-quit 0)' ''' # via plug-in-photocopy gimpscript=''' gimp -i -d -f -b '(define (photocopy infile outfile) (let* ((image (car (gimp-file-load RUN-NONINTERACTIVE infile infile))) (drawable (car (gimp-image-get-active-layer image)))) (plug-in-photocopy RUN-NONINTERACTIVE image drawable 50 0.8 .2 .2) (set! drawable (car (gimp-image-get-active-layer image))) (gimp-file-save RUN-NONINTERACTIVE image drawable outfile outfile) (gimp-image-delete image))) (photocopy "%s" "%s") (gimp-quit 0)' ''' os.system("gimp -i -d -f -b '(gimp-quit 0)'") for filename in filenames_sorted: filenames_temp_bwpng[filename]=tempFilename(".png") filenames_temp_pdf[filename]=tempFilename(".pdf") filenames_temp_tif[filename]=tempFilename(".tif") # print "Converting %s to b/w..." % filename os.system(gimpscript % (filename,filenames_temp_bwpng[filename])) # print "Converting %s to pdf..." % filename os.system("convert %s %s" % (filenames_temp_bwpng[filename],filenames_temp_pdf[filename])) if tesseract=="TRUE": # print "Converting %s for tesseract use..." % filename os.system("convert %s %s" % (filename,filenames_temp_tif[filename])) # print "Using tesseract OCR on %s..." % filename os.system("tesseract %s %s -l deu" % (filenames_temp_tif[filename],filenames_temp_tif[filename])) print "" # This is to end the "#"-Line (see at the beginning of the big loop) # Produce a single pdf file using pdftk filenames_sorted_string="" for filename in filenames_sorted: filenames_sorted_string=filenames_sorted_string+filenames_temp_pdf[filename]+" " if os.path.exists("bwscan.pdf"): stop=1 answer=raw_input("bwscan.pdf exists. Remove? [y/n] ") if (answer=="y"): os.remove("bwscan.pdf") stop=0 else: stop=0 if stop==0: if tesseract=="TRUE": # print "Producing bwscan.txt..." for filename in filenames_in: merge(filenames_temp_tif[filename]+".txt",temp_final_textfile) # print "Producing bwscan.pdf..." os.system("pdftk %s cat output %s" % (filenames_sorted_string,temp_final_pdf)) if tesseract=="TRUE": # print "Attaching bwscan.txt to bwscan.pdf..." os.system("pdftk %s attach_files %s to_page 1 output bwscan.pdf" % (temp_final_pdf,temp_final_textfile)) else: # This has a reason: os.rename() does not work, if the files are on different partitions # (e.g. /tmp and /home) os.system("pdftk %s cat output bwscan.pdf") answer=raw_input("Should I remove the original files? [y/n] ") if (answer=="y"): for filename in filenames_in: print "Removing %s" % filename os.remove(filename) #print "Removing garbage..." for filename,tempfile in filenames_temp_bwpng.items(): os.remove(tempfile) for filename,tempfile in filenames_temp_pdf.items(): os.remove(tempfile) for filename,tempfile in filenames_temp_tif.items(): os.remove(tempfile) if tesseract=="TRUE": os.remove(tempfile+".txt") os.remove(temp_final_textfile) os.remove(temp_final_pdf) os.rmdir(TEMPDIR)