1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/bin/bash
#
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Tool home (currently unused):
#   home="/home/shared/90 Ressourcen/Tools/OCR"

# Prefix prepended to the names of output files.
pre='OCR'

# Working folders: untouched original scans, finished OCR PDFs, and
# temporary intermediate files.
org='./Originals'
final='./Final'
tmp='./Temp'

# Options handed to ImageMagick "convert".
# Alternative: co="-normalize -density 300x300 -depth 8"
co='-density 300x300'

# Options handed to unpaper.
uo='--no-blurfilter --overwrite --no-grayfilter'

# Options handed to tesseract.
tesso='-l deu pdf'

# Optional file with additional user words for tesseract; empty disables it.
# Example: tessw="./Config/userwords"
tessw=''

# File containing the tag list.
tags='./Config/TagList.txt'
# Sanitize: validate the configured folders and optional input files before
# any PDF is touched. Fatal problems are reported on stderr and end the
# script with exit status 1.
if [[ ! -d "$org" ]]; then
  echo "Target directory for original files does not exist: $org" >&2
  exit 1
fi
if [[ ! -d "$final" ]]; then
  echo "Target directory for final files does not exist: $final" >&2
  exit 1
fi
# Optional tesseract user-words file. An empty setting means "not configured";
# a configured file must exist and is then prepended to the tesseract options.
if [[ -z "$tessw" ]]; then
  echo "tesseract: No file for additional words chosen."
elif [[ ! -e "$tessw" ]]; then
  echo "tesseract: File with additional word list not found: $tessw" >&2
  exit 1
else
  tesso="--user-words $tessw $tesso"
fi
# Optional tag list file. An empty setting means "not configured"; a
# configured file must exist.
if [[ -z "$tags" ]]; then
  echo "tags: No file as tag list chosen."
elif [[ ! -e "$tags" ]]; then
  echo "tags: File with tag list not found: $tags" >&2
  exit 1
fi
# for each file:
# Process every PDF in the current directory. Reads the settings $co, $uo
# and $tmp that are assigned earlier in the script.
#
# The unpaper options are stored as one string; split them into an array once
# so that each option reaches unpaper as a separate argument.
read -r -a unpaper_opts <<< "$uo"
for f in ./*.pdf; do
  # An unmatched glob stays literal; skip it when no PDF is present.
  [[ -e "$f" ]] || continue
  f=${f##*/}
  # Strip only the final extension so "a.b.pdf" keeps "a.b" as its stem and
  # does not collide with other files that share the same first component.
  stem=${f%.*}
  # Number of Pages in original PDF
  p=$(pdftk "$f" dump_data | grep NumberOfPages | sed "s/[^0-9]*//")
  if [[ ! "$p" =~ ^[0-9]+$ ]]; then
    echo "Could not determine page count, skipping: $f" >&2
    continue
  fi
  # 1. copy file to "Original" folder (not enabled yet).
  # cp -f -p "$f" "$org"
  # 2. convert file to pbm.
  echo "Converting. Options: $co"
  tiff="${stem}_%d.pbm"
  echo "convert $co $f $tmp/$tiff"
  # The conversion itself is still disabled; only the command line is printed.
  # convert $co "$f" "$tmp/co_$tiff"
  # 3. unpaper, one page at a time.
  # NOTE(review): the inputs are the co_* files the disabled convert step
  # would create -- confirm they exist before relying on this step.
  for ((i = 0; i < p; i++)); do
    unpaper "${unpaper_opts[@]}" \
      "$tmp/co_${stem}_${i}.pbm" "$tmp/up_${stem}_${i}.pbm"
  done
  # 4. tesseract (not enabled yet; the loop is kept as a placeholder).
  for ((i = 0; i < p; i++)); do
    # pdf="${pre}_$(date +%Y%m%d)_${stem}_$i"
    # tesseract "$tmp/up_${stem}_${i}.pbm" "$tmp/$pdf" $tesso &
    :
  done
  # 5. add "Original Scan Date"
  # 6. add "OCR Date"
  # 7. check text vs tag file.
  # 8. add any identified tags.
  # 9. rename file:
  # "S"
  # Original Scan Date: "YYYYMMDD_"
  # Company Name: "UNKOWN_"
  # Tags: list of tags, separated with _
  # 10. move output file to finished folder
  # 11. remove all intermediate files
done
# chmod +777 -R -f "$final"
# chmod +777 -R -f "$org"