Ubuntu Pastebin

Paste from Hinnerk at Thu, 10 Mar 2016 19:46:56 +0000

Download as text
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/bin/bash
#
# --- Configuration -----------------------------------------------------------

# Home directory of the OCR tool (not referenced by the rest of this script).
home='/home/shared/90 Ressourcen/Tools/OCR'

# Prefix put in front of the generated output file names.
pre='OCR'

# Directory that receives the original scans.
org='./Originals'

# Directory that receives the finished OCR PDFs.
final='./Final'

# Directory for temporary (intermediate) files.
tmp='./Temp'

# Options handed to ImageMagick "convert".
# Alternative: co='-normalize -density 300x300 -depth 8'
co='-density 300x300'

# Options handed to "unpaper".
uo='--no-blurfilter --overwrite --no-grayfilter'

# Options handed to "tesseract".
tesso='-l deu pdf'

# Optional file with additional user words for tesseract; empty disables it.
# Example: tessw='./Config/userwords'
tessw=''

# File containing the list of tags.
tags='./Config/TagList.txt'
 
# Sanity checks: abort with exit status 1 (message on stderr) as soon as a
# required directory or a configured file is missing.

# The directories for original scans and finished PDFs must already exist.
if [ ! -d "$org" ]; then
        echo "Target directory for original files does not exist: $org" >&2
        exit 1
fi
if [ ! -d "$final" ]; then
        echo "Target directory for final files does not exist: $final" >&2
        exit 1
fi

# The tesseract user-words file is optional. When one is configured it must
# exist; only then is it prepended to the tesseract options.
if [ -z "$tessw" ]; then
        echo "tesseract: No file for additional words chosen."
elif [ ! -e "$tessw" ]; then
        echo "tesseract: File with additional word list not found: $tessw" >&2
        exit 1
else
        tesso="--user-words $tessw $tesso"
fi

# The tag list is optional as well. An empty setting is only reported; a
# configured but missing file is an error.
if [ -z "$tags" ]; then
        echo "tags: No file as tag list chosen."
elif [ ! -e "$tags" ]; then
        echo "tags: Tag list file not found: $tags" >&2
        exit 1
fi
 
# Main loop: process every PDF found in the current directory.
#
# Reads the configuration variables uo, co, tmp and pre set earlier in this
# script. Steps that are still commented out (saving the original, convert,
# tesseract) are kept disabled exactly as before.

# Split the unpaper option string into separate arguments once; passing "$uo"
# quoted would hand all options to unpaper as a single argument.
read -r -a unpaper_opts <<< "$uo"

# Date stamp used in the output file names.
today=$(date +%Y%m%d)

# Glob instead of parsing "ls", so file names containing whitespace survive.
for f in ./*.pdf; do
	# An unmatched glob stays literal; skip it so that an empty directory
	# is a no-op.
	[ -e "$f" ] || continue
	f=${f##*/}
	# File name up to (not including) the first dot.
	stem=${f%%.*}

	# Number of pages in the original PDF.
	p=$(pdftk "./$f" dump_data | grep NumberOfPages | sed "s/[^0-9]*//")
	case "$p" in
		'' | *[!0-9]*)
			echo "Could not determine page count, skipping: $f" >&2
			continue
			;;
	esac
	# echo "Page Count: $p"

	# 1. copy file to "Original" folder.
	# echo "Saving original..."
	# echo `cp -f -p "$f" "$org"`

	# 2. convert file to pbm.
	# NOTE(review): the convert call itself is commented out, so only the
	# command line is printed here; step 3 expects the co_* files to exist.
	echo "Converting. Options: $co"
	tiff="${stem}_%d.pbm"
	echo "convert $co $f $tmp/$tiff"
	# echo `convert $co "$f" "$tmp/co_$tiff"`

	# 3. unpaper, one page at a time.
	# echo `unpaper "$uo" $tmp/co_${f%%.*}_%d.pbm $tmp/up_${f%%.*}_%d.pbm`
	for ((i = 0; i < p; i++)); do
		page_in="$tmp/co_${stem}_${i}.pbm"
		page_out="$tmp/up_${stem}_${i}.pbm"
		echo "unpaper $uo $page_in $page_out"
		unpaper "${unpaper_opts[@]}" "$page_in" "$page_out" ||
			echo "unpaper failed for: $page_in" >&2
	done

	# 4. tesseract (the call is still disabled; only the target name is built).
	for ((i = 0; i < p; i++)); do
		pdf="${pre}_${today}_${stem}_$i"
		#echo "Tesseract. Options: $tesso"
		#echo "tesseract "$tmp/up_${f%%.*}_${i}.pbm" "$tmp/$pdf" $tesso &"
		# echo `tesseract "$tmp/up_${f%%.*}_${i}.pbm" "$tmp/$pdf" $tesso &`
	done

	# 5. add "Original Scan Date"
	# 6. add "OCR Date"
	# 7. check text vs tag file.
	# 8. add any identified tags.
	# 9. rename file:
	#                               "S"
	#       Original Scan Date: "YYYYMMDD_"
	#                               Company Name: "UNKNOWN_"
	#                               Tags: list of tags, separated with _
	# 10. move output file to finished folder
	# 11. remove all intermediate files

done
# chmod +777 -R -f "$final"
# chmod +777 -R -f "$org"
 
Download as text