#!/bin/bash

### Inspired by script pdf-trim-to-djvu.sh (http://gist.github.com/315791)
### PUBLIC DOMAIN

### Settings
# Each user-tunable value is followed by a pristine copy of its default; the
# copies are what usage() prints, so they stay correct after option parsing
# has overwritten the live variables.
DPI=300                        ; dpidefault=$DPI
treshold=129                   ; trdefault=$treshold
usemini=0                      ; midefault=$usemini
usecodjvu=0                    ; codefault=$usecodjvu
ag=0                           ; agdefault=$ag
usepro='-contrast -blur 0x1'   ; prodefault=$usepro
ocrengine=''                   ; ocrenginedefault=$ocrengine
ocrlanguage=''                 ; ocrlanguagedefault=$ocrlanguage
verbmini=0                     ; verbminidefault=$verbmini
tmp=0                          ; tmpdefault=$tmp
ocrjobs=1                      ; ocrjobsdefault=$ocrjobs
# internal: path the script was invoked as (usage() shows its basename)
prog=$0

# Print the program name and version on a single line.
printversion() {
  printf '%s\n' "img2djvu version 20151127"
}

# Print the help text on stdout.
# Reads $prog (invocation path) and the *default variables, so the text
# always shows the built-in defaults rather than values given on the
# command line.
usage() {
  # Only the file name of the script is shown in the synopsis.
  me=$(basename "$prog")
  cat << USAGE_TEXT
Usage: "$me" [options] relative_folder_name

Options:
 -a <0|1|2> aggressivity: 0 is not aggressive, 1 is aggressive, 2 is very aggressive [default: "$agdefault"]
 -c <0|1>   make a choice of temporary directory, 0 for /tmp, 1 for current [default: "$tmpdefault"]
 -d <int>   resolution in DPI [default: "$dpidefault"]
 -e "str"   if not empty, use OCR engine (supported by ocrodjvu) with this name [default: "$ocrenginedefault"]
 -j <int>   number of OCR jobs [default: "$ocrjobsdefault"]
 -l <int>   if not 0, will use forced segmentation (with <int> downsampling) [default: "$codefault"]
 -m <int>   if not 0, will use minidjvu (with <int> dictionary size) instead of cjb2 [default: "$midefault"]
 -r "str"   if not empty, use OCR engine with given language [default: "$ocrlanguagedefault"]
 -p "str"   process color layer with given ImageMagick options [default: "$prodefault"]
 -t <int>   if not 0, will use cpaldjvu for all images with number of colors less than <int> [default: "$trdefault"]
 -v <0|1>   if not 0, minidjvu run will be verbose [default: "$verbminidefault"]
 -h|--help  print this message
 -V         print version and exit
USAGE_TEXT
}

# Parse the command line with getopt(1), which also moves options in front
# of the positional argument.  After this block "$@" holds only the
# positional arguments.
#
# If getopt rejects the command line it has already printed the reason on
# stderr; stop here instead of falling through with the raw, unparsed
# arguments (which used to spin forever in the loop below whenever the first
# raw argument was not an option).
if ! opts=$(getopt -l "help" "a:c:d:e:j:l:m:p:r:t:v:hV" "$@") ; then
  printf "\n"
  usage
  exit 1
fi
# getopt emits a shell-quoted argument list meant to be re-read with eval.
eval set -- "$opts"
while [ $# -gt 0 ] ; do
  case "$1" in
    -a) ag="$2" ; shift 2 ;;
    -c) tmp="$2" ; shift 2 ;;
    -d) DPI="$2" ; shift 2 ;;
    -e) ocrengine="$2" ; shift 2 ;;
    -j) ocrjobs="$2" ; shift 2 ;;
    -l) usecodjvu="$2" ; shift 2 ;;
    -m) usemini="$2" ; shift 2 ;;
    -r) ocrlanguage="$2" ; shift 2 ;;
    -p) usepro="$2" ; shift 2 ;;
    -t) treshold="$2" ; shift 2 ;;
    -v) verbmini="$2" ; shift 2 ;;
    -h|--help) usage ; exit 0 ;;
    -V) printversion ; exit 0 ;;
    --) shift ; break ;;
    # Anything else ends option processing; the argument-count check that
    # follows reports the problem.
    *) break ;;
  esac
done

### Checks
# Exactly one positional argument (the folder) must remain after options.
if [ $# -ne 1 ] ; then
  printf "\nFolder name is required.\n\n"
  usage
  exit 1
fi

# The folder is later joined onto the current directory, so it has to be
# relative.
case "$1" in
  /*)
    printf "\nAbsolute folder names are not allowed.\n\n"
    usage
    exit 1
  ;;
esac

if [ ! -d "$1" ] ; then
   # The name is passed as data, never as the format string, so a "%" or a
   # backslash in it cannot garble the message.
   printf "\n%s is not a folder.\n\n" "$1"
   usage
   exit 1
fi

if ! command -v cpaldjvu >/dev/null 2>&1 ; then
   printf "\nDjVu Libre should be installed.\n\n"
   usage
   exit 1
else
   # Take the version that follows the last "-" on the first line printed by
   # cpaldjvu and drop the dots (e.g. 3.5.27 -> 3527).  Digits include 0 so
   # that versions such as 3.5.20 are parsed too.
   dv=$(cpaldjvu 2>&1 | sed -e '2,$d' -e 's/^.*-\([0-9.]*\)$/\1/' -e 's/\.//g')
fi

if ! command -v identify >/dev/null 2>&1 ; then
   printf "\nImageMagick should be installed.\n\n"
   usage
   exit 1
fi

# OCR is enabled only when both the engine and the language are given;
# giving just one of them is an error.
if [[ -n "$ocrengine" && -n "$ocrlanguage" ]] ; then
   if ! command -v ocrodjvu >/dev/null 2>&1 ; then
      printf "\nFor OCR, ocrodjvu should be installed.\n\n"
      usage
      exit 1
   fi
   useocr=1
elif [[ -n "$ocrengine" || -n "$ocrlanguage" ]] ; then
   printf "\nFor OCR, both OCR options (-e and -r) should be specified.\n\n"
   usage
   exit 1
else
   useocr=0
fi

### Aggressivity options
# Translate the aggressivity level into per-encoder flags and into the
# bracket strings (ags/age) printed around each page's progress symbol.
# Every flag starts out empty; each level then sets only what it needs.
c44opt=""
cjb2opt=""
cpaldjvuopt=""
minidjvuopt=""
case "$ag" in
   0)
      ags="<" ; age=">"
      cpaldjvuopt="-bgwhite"
   ;;
   1)
      ags="<<" ; age=">>"
      c44opt="-slice 74+13+10"
      cjb2opt="-clean"
      minidjvuopt="--clean"
   ;;
   2)
      ags="<<<" ; age=">>>"
      c44opt="-slice 76+15"
      cjb2opt="-lossy"
      minidjvuopt="--lossy --aggression 200"
   ;;
   *)
      printf "\nAggressivity has only three possible values: 0, 1, or 2.\n\n"
      usage
      exit 1
   ;;
esac

### General coders functions

# colorcoder SRC DST
# Encode a color page: c44coder when usecodjvu is below 1, otherwise the
# layer-separating codjvucoder.  Returns the chosen coder's status.
colorcoder() {
   local coder=codjvucoder
   if [ "$usecodjvu" -lt 1 ] ; then
      coder=c44coder
   fi
   "$coder" "$1" "$2"
}

### Subcoders functions

### Black and white
# cjb2coder SRC DST
# Encode a bitonal image with cjb2 at $DPI.  Prints no progress symbol,
# because codjvucoder calls it as well.
cjb2coder() {
   local src="$1" dst="$2"
   # $cjb2opt is left unquoted on purpose: it holds zero or more flags.
   cjb2 $cjb2opt -dpi "$DPI" "$src" "$dst"
}

# minidjvucoder "PAGE PAGE ..." DST
# Encode a run of bitonal pages with minidjvu, then print the opening
# bracket, "M:" and the number of pages in the run ($bwcount).
minidjvucoder() {
   local verbose_flag=""
   if [ "$verbmini" -gt 0 ] ; then
      verbose_flag="-v"
   fi
   # $@ stays unquoted on purpose: the first argument is a space-separated
   # list of page files that has to be split into separate words.
   minidjvu -d "$DPI" -p "$usemini" $minidjvuopt $verbose_flag $@
   printf "${ags}M:"
   printf "$bwcount"
}

### Color
# cpaldjvucoder SRC DST
# Encode a low-color page with cpaldjvu at $DPI and print the progress
# symbol "F".
cpaldjvucoder() {
   local src="$1" dst="$2"
   # $cpaldjvuopt is left unquoted on purpose: it may be empty or one flag.
   cpaldjvu $cpaldjvuopt -dpi "$DPI" "$src" "$dst"
   printf "F"
}

# c44coder SRC DST
# Encode a color page with c44 at $DPI and print the progress symbol "C".
c44coder() {
   local src="$1" dst="$2"
   # $c44opt is left unquoted on purpose: it holds zero or more words.
   c44 $c44opt -dpi "$DPI" "$src" "$dst"
   printf "C"
}

# Layer separation and "forced segmentation" coder
# codjvucoder SRC DST
# Build a compound DjVu page from SRC: a bitonal mask (Sjbz), a black
# foreground and a background (BG44) downsampled by $usecodjvu.
# Reads:   tmpdir, usecodjvu, usepro, c44opt, DPI, dv
# Prints:  the progress string "L:1+1+1"
# Returns: the status of the final djvumake call (1 if the scratch
#          directory cannot be set up)
# The scratch directory is entered only inside a subshell, so the caller's
# working directory is never changed.
function codjvucoder {
   local src="$1" dst="$2"
   local tmcdir newsize status
   # An empty tmpdir would make the scratch directory "/tmp", which is
   # removed recursively at the end - refuse to run in that case.
   [ -n "$tmpdir" ] || return 1
   # folders
   printf "L"
   tmcdir="$tmpdir"/tmp
   mkdir -p "$tmcdir" || return 1
   # image parameters
   newsize=$(identify -format "%[fx:ceil(w/$usecodjvu)]x%[fx:ceil(h/$usecodjvu)]" "$src")
   # separate layers from Scan Tailor
   convert "$src" -threshold 1 "$tmcdir/fore.pbm"
   # $usepro is left unquoted on purpose: it holds several ImageMagick options
   convert "$src" -fill white -opaque black $usepro -resize "$newsize"\! "$tmcdir/back.ppm"
   (
      cd "$tmcdir" || exit 1
      printf ":"
      # make BG44 chunk
      c44 $c44opt -dpi "$DPI" back.ppm back.djvu
      djvuextract back.djvu BG44=bg44.cnk >/dev/null 2>&1
      printf "1"
      # make Sjbz chunk
      cjb2coder fore.pbm sjbz.djvu
      djvuextract sjbz.djvu Sjbz=sjbz.cnk >/dev/null 2>&1
      printf "+1"
      # A version that could not be parsed into a number is treated like a
      # recent one, without an error message from the numeric test.
      if [[ "$dv" =~ ^[0-9]+$ ]] && [ "$dv" -lt 3523 ] ; then
         # make FG44 chunk from background (!)
         convert -size "$newsize"\! -depth 8 xc:black fore.pgm
         c44 -dpi "$DPI" -slice 120 fore.pgm fore.djvu
         djvuextract fore.djvu BG44=fg44.cnk >/dev/null 2>&1
         # make compound DjVu
         djvumake "$dst" INFO=,,"$DPI" Sjbz=sjbz.cnk FG44=fg44.cnk BG44=bg44.cnk
      else
         # use new ability of djvumake to create FGbz chunk on the fly
         djvumake "$dst" INFO=,,"$DPI" Sjbz=sjbz.cnk FGbz="#black" BG44=bg44.cnk
      fi
   )
   status=$?
   printf "+1"
   rm -rf "$tmcdir"
   return "$status"
}

### General coder and bundler
# convert every page to PNM (tifftopnm for TIFFs when available, otherwise convert)
# determine the color depth of each page
# if depth <= 1, compress with cjb2
# if depth > 1 and treshold > 0, count the number of unique colors
# if unique colors < treshold (129 by default), compress with cpaldjvu
# if unique colors >= treshold, compress with c44 or codjvu
# display progress symbols (B or M for Bitonal, C for TrueColor, L for codjvu layers, F for Palette)
# create the bundled DjVu file
# nomini: encode every file of "$fld" as one DjVu page (cjb2 for bitonal
# pages, cpaldjvu / colorcoder for the rest), bundle the pages into "$djvu"
# and optionally run OCR on the result.
# Reads:   fld, tmpdir, djvu, ags, age, treshold, useocr, ocrengine,
#          ocrlanguage, ocrjobs
# Prints:  one progress group per page, "." for a file that could not be
#          converted, then "Done." or a failure notice
# Returns: 0 on success (the temporary directory is removed), 2 on failure
#          (the temporary directory is kept for inspection)
function nomini {
if (
  cd "$fld" &&
  (
    # Probe for tifftopnm once, and always give the flag a value so the
    # test below never sees an empty operand.
    have_tifftopnm=0
    if command -v tifftopnm >/dev/null 2>&1 ; then
      have_tifftopnm=1
    fi
    # for every page (the glob is expanded once, in sorted order)
    for f in * ; do
      # an unmatched glob leaves a literal "*" behind
      if [ ! -e "$fld/$f" ] && [ ! -L "$fld/$f" ] ; then
        continue
      fi
      # sub-directories are not pages
      if [ -d "$fld/$f" ] && [ ! -L "$fld/$f" ] ; then
        continue
      fi
      of="$tmpdir/${f%.*}.pnm"
      # Use tifftopnm for TIFFs, if installed (it is faster / uses less
      # memory).  Only the file name's own extension is looked at, so a
      # ".tif" elsewhere in the path cannot misroute other formats.
      use_tifftopnm=0
      if [ "$have_tifftopnm" -eq 1 ] ; then
        case "$f" in
          *.tif|*.tiff|*.TIF|*.TIFF) use_tifftopnm=1 ;;
        esac
      fi
      if [ "$use_tifftopnm" -eq 1 ] ; then
        tifftopnm "$fld/$f" > "$of" 2>/dev/null
      else
        # for all other images (JPEG etc), use convert
        convert "$fld/$f" "$of" 2>/dev/null
      fi
      convertfailed=$?
      # any non-zero status means the page could not be converted
      if [ "$convertfailed" -ne 0 ] ; then
        printf "."
        continue
      fi
      printf '%s' "$ags" && \
      ( # switch between formats
        if [ "$(identify -format "%z" "$of")" -gt 1 ] ; then
          if [ "$treshold" -gt 0 ] ; then
            colors=$(identify -format "%k" "$of") && \
            if [ "$colors" -lt "$treshold" ] ; then
              cpaldjvucoder "$of" "${of%pnm}djvu"
            else
              colorcoder "$of" "${of%pnm}djvu"
            fi
          else
            colorcoder "$of" "${of%pnm}djvu"
          fi
        else
          cjb2coder "$of" "${of%pnm}djvu"
          printf "B"
        fi
      ) && \
      rm "$of" && \
      printf '%s' "$age"
    done
    # A page that failed to encode must not abort the bundling step.
    :
  ) &&
  cd "$tmpdir" && \
  djvm -c merged.djvu *.djvu && \
  mv merged.djvu "$djvu" && \
  printf "\nDone.\n" && \
  if [ "$useocr" -gt 0 ] ; then
     printf "Starting OCR...\n"
     ocrodjvu --engine "$ocrengine" --language "$ocrlanguage" --jobs "$ocrjobs" --in-place --on-error=resume "$djvu"
  fi
) && rm -rf "$tmpdir" ; then
  return 0
else
  printf "Failure\nTemporary directory left: %s\n" "$tmpdir"
  return 2
fi
}

### Minidjvu-based coder and bundler
# Similar to the previous one, but instead of cjb2 it calls minidjvu on a whole run of black-and-white pages: whenever the run is interrupted by a color image, and once more when the input ends on a black-and-white file
# It works on sequences of pages, so its progress output is less verbose; minidjvu is also slower than cjb2
# mini: like nomini, but consecutive bitonal pages are collected and encoded
# together by minidjvucoder; color pages go through cpaldjvucoder /
# colorcoder.  A pending run is encoded when a color page interrupts it and
# once more after the last file, so it is not lost when the final file
# cannot be converted.
# Reads:   fld, tmpdir, djvu, ags, age, treshold, useocr, ocrengine,
#          ocrlanguage, ocrjobs
# Returns: 0 on success (the temporary directory is removed), 2 on failure
#          (the temporary directory is kept for inspection)
function mini {
if (
  cd "$fld" &&
  (
    bwcount=0
    bwpages=""
    of=""
    # Probe for tifftopnm once, and always give the flag a value so the
    # test below never sees an empty operand.
    have_tifftopnm=0
    if command -v tifftopnm >/dev/null 2>&1 ; then
      have_tifftopnm=1
    fi
    # The glob is expanded once, in sorted order.  Files are checked through
    # their absolute path because a coder may change the working directory.
    for f in * ; do
      # an unmatched glob leaves a literal "*" behind
      if [ ! -e "$fld/$f" ] && [ ! -L "$fld/$f" ] ; then
        continue
      fi
      # sub-directories are not pages
      if [ -d "$fld/$f" ] && [ ! -L "$fld/$f" ] ; then
        continue
      fi
      of="$tmpdir/${f%.*}.pnm"
      # Use tifftopnm for TIFFs, if installed (it is faster / uses less
      # memory).  Only the file name's own extension is looked at.
      use_tifftopnm=0
      if [ "$have_tifftopnm" -eq 1 ] ; then
        case "$f" in
          *.tif|*.tiff|*.TIF|*.TIFF) use_tifftopnm=1 ;;
        esac
      fi
      if [ "$use_tifftopnm" -eq 1 ] ; then
        tifftopnm "$fld/$f" > "$of" 2>/dev/null
      else
        # for all other images (JPEG etc), use convert
        convert "$fld/$f" "$of" 2>/dev/null
      fi
      convertfailed=$?
      # any non-zero status means the page could not be converted
      if [ "$convertfailed" -ne 0 ] ; then
        printf "."
        continue
      fi
      if [ "$(identify -format "%z" "$of")" -gt 1 ] ; then
         # A color page ends the current bitonal run.  The run is stored as
         # "<page>.1.djvu" and the color page as "<page>.2.djvu", so the
         # sorted glob used for bundling keeps the run in front of the page.
         if [ "$bwcount" -gt 0 ] ; then
            minidjvucoder "$bwpages" "${of%pnm}1.djvu"
            # unquoted on purpose: bwpages is a space-separated list
            rm -f $bwpages
            bwpages=""
            bwcount=0
            printf '%s' "$age"
         fi
         printf '%s' "$ags"
         if [ "$treshold" -gt 0 ] ; then
            colors=$(identify -format "%k" "$of")
            if [ "$colors" -lt "$treshold" ] ; then
               cpaldjvucoder "$of" "${of%pnm}2.djvu"
            else
               colorcoder "$of" "${of%pnm}2.djvu"
            fi
         else
            colorcoder "$of" "${of%pnm}2.djvu"
         fi
         rm "$of" && \
         printf '%s' "$age"
      else
         bwpages="$bwpages $of"
         bwcount=$((bwcount+1))
      fi
    done
    # Encode the run that is still pending after the last file, whatever
    # happened to that file.
    if [ "$bwcount" -gt 0 ] ; then
       minidjvucoder "$bwpages" "${of%pnm}1.djvu"
       rm -f $bwpages
       bwpages=""
       bwcount=0
       printf '%s' "$age"
    fi
    # A page that failed to encode must not abort the bundling step.
    :
  ) &&
  cd "$tmpdir" && \
  djvm -c "$djvu" *.djvu && \
  printf "\nDone.\n" && \
  if [ "$useocr" -gt 0 ] ; then
     printf "Starting OCR...\n"
     ocrodjvu --engine "$ocrengine" --language "$ocrlanguage" --jobs "$ocrjobs" --in-place --on-error=resume "$djvu"
  fi
) && rm -rf "$tmpdir" ; then
  return 0
else
  printf "Failure\nTemporary directory left: %s\n" "$tmpdir"
  return 2
fi
}

# Absolute paths to input folder and output DjVu files
fld="$(pwd)/$1"
# the output is named after the folder, minus one trailing slash
djvu="$(pwd)/${1%/}.djvu"
if [ "$tmp" -eq 0 ]; then
   tmpdirprefix="/tmp"
else
   tmpdirprefix=$(pwd)
fi

### START!
# count the entries of the folder that are not sub-directories
pgcount=$(ls -1p "$fld" | grep -c -v '/$')
printf "%s files:\n" "$pgcount"
if [ "$usemini" -lt 1 ] ; then
   coder=nomini
else
   if ! command -v minidjvu >/dev/null 2>&1 ; then
      printf "\nFor -m option, minidjvu is required.\n\n"
      usage
      exit 1
   fi
   # minidjvu receives its page list as one space-separated string, so
   # neither the page names nor the temporary directory may contain spaces.
   # grep's output is kept visible: it lists the offending names.
   if ls -1p "$fld" | grep " " ; then
      printf "\nSome file names contain spaces, minidjvu will not run!\n"
      exit 1
   fi
   case "$tmpdirprefix" in
      *" "*)
         printf "\nPath to temporary directory contain spaces, minidjvu will not run!\n"
         exit 1
      ;;
   esac
   coder=mini
fi
# Never continue without a real temporary directory: an empty tmpdir would
# be used for page files and for the final recursive removal.
if ! tmpdir=$(mktemp -d "$tmpdirprefix"/pagesXXXXXX) || [ -z "$tmpdir" ] ; then
   printf "\nCannot create a temporary directory in %s.\n\n" "$tmpdirprefix"
   exit 1
fi
# the coder's status becomes the exit status of the script
"$coder"
