Commit ec129b70 authored by Jérome Perrin

OCR: Tesseract 4.1.1 / Ghostscript 9.54.0

With tesseract v4.0.0-beta.3 we often observe crashes with:

```
contains_unichar_id(unichar_id):Error:Assert failed:in file ../../src/ccutil/unicharset.h, line 511
```

This seems to have been fixed by https://github.com/tesseract-ocr/tesseract/pull/1954

Still, even after updating to 4.1.1, text recognition from PDF in ERP5 is too expensive. We also update Ghostscript to 9.54.0, because this version has built-in OCR, which does not need to convert the PDF to PNG then TIFF as we currently do in ERP5.

See merge request nexedi/slapos!985
parents 582b0b03 1b291415
Pipeline #15851 failed with stage
...@@ -2,17 +2,22 @@ ...@@ -2,17 +2,22 @@
extends = extends =
../fontconfig/buildout.cfg ../fontconfig/buildout.cfg
../freetype/buildout.cfg ../freetype/buildout.cfg
../libjpeg/buildout.cfg
../libtiff/buildout.cfg ../libtiff/buildout.cfg
../libxml2/buildout.cfg ../libxml2/buildout.cfg
../pkgconfig/buildout.cfg ../pkgconfig/buildout.cfg
../tesseract/buildout.cfg
../xz-utils/buildout.cfg ../xz-utils/buildout.cfg
parts = ghostscript parts = ghostscript
[ghostscript-common] [ghostscript]
recipe = slapos.recipe.cmmi recipe = slapos.recipe.cmmi
shared = true shared = true
pkg_config_depends = ${libtiff:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends} url = https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs9540/ghostscript-9.54.0.tar.gz
md5sum = 5d571792a8eb826c9f618fb69918d9fc
pkg_config_depends = ${libtiff:location}/lib/pkgconfig:${libjpeg:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends}
# XXX --with-tessdata works around a slaprunner bug where software is installed in a path containing //
configure-options = configure-options =
--disable-cups --disable-cups
--disable-threadsafe --disable-threadsafe
...@@ -20,18 +25,18 @@ configure-options = ...@@ -20,18 +25,18 @@ configure-options =
--without-libidn --without-libidn
--without-x --without-x
--with-drivers=FILES --with-drivers=FILES
# it seems that parallel build sometimes fails for ghostscript. --with-tessdata=$(python -c 'print("""${:tessdata-location}""".replace("//", "/"))')
make-options = -j1
environment = environment =
PATH=${pkgconfig:location}/bin:${xz-utils:location}/bin:%(PATH)s PATH=${pkgconfig:location}/bin:${xz-utils:location}/bin:%(PATH)s
PKG_CONFIG_PATH=${:pkg_config_depends} PKG_CONFIG_PATH=${:pkg_config_depends}
LDFLAGS=-Wl,-rpath=${fontconfig:location}/lib -Wl,-rpath=${freetype:location}/lib -Wl,-rpath=${libtiff:location}/lib CFLAGS=-I${libjpeg:location}/include
LDFLAGS=-Wl,-rpath=${fontconfig:location}/lib -Wl,-rpath=${freetype:location}/lib -Wl,-rpath=${libtiff:location}/lib -L${libjpeg:location}/lib -Wl,-rpath=${libjpeg:location}/lib
LD_LIBRARY_PATH=${fontconfig:location}/lib:${freetype:location}/lib:${libtiff:location}/lib:${libxml2:location}/lib LD_LIBRARY_PATH=${fontconfig:location}/lib:${freetype:location}/lib:${libtiff:location}/lib:${libxml2:location}/lib
[ghostscript] # configure gives priority to local jpeg library and refuse mixing local libjpeg with "system" libtiff.
<= ghostscript-9 # We remove this local jpeg library source folder so that configure picks up the slapos versions of these libraries.
pre-configure = rm -r jpeg
[ghostscript-9] post-make-hook = ${tesseract-download-traineddata:post-make-hook}
<= ghostscript-common tessdata-location = @@LOCATION@@/share/tessdata/
url = https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs950/ghostscript-9.50.tar.xz tessdata-urls = ${tesseract-download-traineddata:urls}
md5sum = 6cea6bae4a7cdfac6ccb09f07f0caf8c
...@@ -7,17 +7,14 @@ extends = ...@@ -7,17 +7,14 @@ extends =
../libtiff/buildout.cfg ../libtiff/buildout.cfg
../webp/buildout.cfg ../webp/buildout.cfg
../giflib/buildout.cfg ../giflib/buildout.cfg
../patch/buildout.cfg
[leptonica] [leptonica]
recipe = slapos.recipe.cmmi recipe = slapos.recipe.cmmi
url = http://www.leptonica.com/source/leptonica-1.76.0.tar.gz
md5sum = a263a5e4f7e8f8a661fb121a265d2d20
shared = true shared = true
url = http://www.leptonica.org/source/leptonica-1.80.0.tar.gz
md5sum = d640d684234442a84c9e8902f0b3ff36
configure-options = configure-options =
--disable-static --disable-static
environment = environment =
CPPFLAGS=-I${zlib:location}/include -I${libjpeg:location}/include -I${libpng:location}/include -I${libtiff:location}/include -I${webp:location}/include -I${giflib:location}/include CPPFLAGS=-I${zlib:location}/include -I${libjpeg:location}/include -I${libpng:location}/include -I${libtiff:location}/include -I${webp:location}/include -I${giflib:location}/include
LDFLAGS=-L${zlib:location}/lib -Wl,-rpath=${zlib:location}/lib -L${libjpeg:location}/lib -Wl,-rpath=${libjpeg:location}/lib -L${libpng:location}/lib -Wl,-rpath=${libpng:location}/lib -L${libtiff:location}/lib -Wl,-rpath=${libtiff:location}/lib -L${webp:location}/lib -Wl,-rpath=${webp:location}/lib -L${giflib:location}/lib -Wl,-rpath=${giflib:location}/lib LDFLAGS=-L${zlib:location}/lib -Wl,-rpath=${zlib:location}/lib -L${libjpeg:location}/lib -Wl,-rpath=${libjpeg:location}/lib -L${libpng:location}/lib -Wl,-rpath=${libpng:location}/lib -L${libtiff:location}/lib -Wl,-rpath=${libtiff:location}/lib -L${webp:location}/lib -Wl,-rpath=${webp:location}/lib -L${giflib:location}/lib -Wl,-rpath=${giflib:location}/lib
PATH=${patch:location}/bin:%(PATH)s
This patch is originally taken from:
http://leptonica.googlecode.com/issues/attachment?aid=560001000&name=zlib-include.patch&token=m2sugSYxB4xwAuNgrKXyHTxBYNg%3A1337345966091
To fix the following issue with leptonica:
http://code.google.com/p/leptonica/issues/detail?id=56
diff -Nurd -x'*~' leptonica-1.68.orig/src/pngio.c leptonica-1.68/src/pngio.c
--- leptonica-1.68.orig/src/pngio.c 2011-02-01 00:41:12.000000000 -0500
+++ leptonica-1.68/src/pngio.c 2011-07-09 09:17:17.000000000 -0400
@@ -108,6 +108,10 @@
#include "png.h"
+#ifdef HAVE_LIBZ
+#include "zlib.h"
+#endif
+
/* ----------------Set defaults for read/write options ----------------- */
/* strip 16 bpp --> 8 bpp on reading png; default is for stripping */
static l_int32 var_PNG_STRIP_16_TO_8 = 1;
...@@ -10,43 +10,34 @@ extends = ...@@ -10,43 +10,34 @@ extends =
../fontconfig/buildout.cfg ../fontconfig/buildout.cfg
../lcms/buildout.cfg ../lcms/buildout.cfg
../pkgconfig/buildout.cfg ../pkgconfig/buildout.cfg
./buildout.hash.cfg
parts = parts =
tesseract tesseract
tesseract-traineddata
tesseract-eng-traineddata
tesseract-osd-traineddata
[tesseract] [tesseract]
recipe = slapos.recipe.cmmi recipe = slapos.recipe.cmmi
url = https://github.com/tesseract-ocr/tesseract/archive/6b250b58121a9858d3e3019a78a6f7d421bd0fc7.tar.gz shared = true
md5sum = fdc38148ad8eb1bd0485a217503dd6d5 url = https://github.com/tesseract-ocr/tesseract/archive/refs/tags/4.1.1.tar.gz
md5sum = 51fe2bcbff1bbce77a25d180fd247f7d
pkg_config_depends = ${leptonica:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends}:${lcms2:location}/lib/pkgconfig:${xz-utils:location}/lib/pkgconfig pkg_config_depends = ${leptonica:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends}:${lcms2:location}/lib/pkgconfig:${xz-utils:location}/lib/pkgconfig
pre-configure = pre-configure =
autoreconf -ivf -I${pkgconfig:location}/share/aclocal -I${libtool:location}/share/aclocal -Wno-portability autoreconf -ivf -I${pkgconfig:location}/share/aclocal -I${libtool:location}/share/aclocal -Wno-portability
configure-options =
--disable-static
--datarootdir=${tesseract-traineddata:location}
# XXX: tesseract seems not easily configurable at runtime about where to find
# its trained data, so we set its datarootdir above to a controlled location
environment = environment =
PATH=${pkgconfig:location}/bin:${autoconf:location}/bin:${automake:location}/bin:${libtool:location}/bin:${m4:location}/bin:${patch:location}/bin:%(PATH)s PATH=${pkgconfig:location}/bin:${autoconf:location}/bin:${automake:location}/bin:${libtool:location}/bin:${m4:location}/bin:${patch:location}/bin:%(PATH)s
PKG_CONFIG_PATH=${:pkg_config_depends} PKG_CONFIG_PATH=${:pkg_config_depends}
LDFLAGS=-L${leptonica:location}/lib -Wl,-rpath=${leptonica:location}/lib -L${jbigkit:location}/lib -Wl,-rpath=${jbigkit:location}/lib -L${zlib:location}/lib -Wl,-rpath=${zlib:location}/lib LDFLAGS=-L${leptonica:location}/lib -Wl,-rpath=${leptonica:location}/lib -L${jbigkit:location}/lib -Wl,-rpath=${jbigkit:location}/lib -L${zlib:location}/lib -Wl,-rpath=${zlib:location}/lib
[tesseract-traineddata] post-make-hook = ${tesseract-download-traineddata:post-make-hook}
location = ${buildout:parts-directory}/${:_buildout_section_name_} tessdata-urls = ${tesseract-download-traineddata:urls}
tessdata-location = @@LOCATION@@/share/tessdata/
[tesseract-eng-traineddata]
recipe = slapos.recipe.build:download
destination = ${tesseract-traineddata:location}/tessdata/eng.traineddata
url = https://github.com/tesseract-ocr/tessdata/raw/590567f20dc044f6948a8e2c61afc714c360ad0e/eng.traineddata
md5sum = 57e0df3d84fed9fbf8c7a8e589f8f012
[tesseract-osd-traineddata] [tesseract-download-traineddata]
recipe = slapos.recipe.build:download post-make-hook = ${:_profile_base_location_}/${download-tessdata.py:filename}#${download-tessdata.py:md5sum}:post_make_hook
destination = ${tesseract-traineddata:location}/tessdata/osd.traineddata urls =
url = https://github.com/tesseract-ocr/tessdata/raw/590567f20dc044f6948a8e2c61afc714c360ad0e/osd.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/eng.traineddata#57e0df3d84fed9fbf8c7a8e589f8f012
md5sum = 7611737524efd1ce2dde67eff629bbcf https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/osd.traineddata#7611737524efd1ce2dde67eff629bbcf
https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/fra.traineddata#a73e70c872f262895d93976febeb1638
https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/jpn.traineddata#af3a30a9bec904e106aa8521e7caaeca
https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/chi_sim.traineddata#6965cb3213edd961cb16264e2ea45f5c
[download-tessdata.py]
filename = download-tessdata.py
md5sum = 2d283a6d8662d6bb8c9de7b26162b702
# This is a post-make hook script to download tesseract training data.
#
# This script uses the following buildout options:
# - tessdata-urls: list of URLs and their expected md5sum as URL fragments
# - tessdata-location: path where to install the data.
import zc.buildout
import os
def post_make_hook(options, buildout, env):
    """Download the tesseract trained-data files listed in the part options.

    Options read from ``options``:
      - ``tessdata-location``: directory the files are written to; it is
        created when it does not exist yet.
      - ``tessdata-urls``: one entry per line, each of the form
        ``URL#md5sum`` (the md5sum is carried as the URL fragment).

    ``buildout['buildout']`` is passed to ``zc.buildout.download.Download``.
    ``env`` is part of the hook signature and is not used here.
    """
    # Import the submodule explicitly: a bare ``import zc.buildout`` does not
    # guarantee that ``zc.buildout.download`` has been loaded.
    import zc.buildout.download

    location = options['tessdata-location']
    if not os.path.exists(location):
        os.makedirs(location)

    download = zc.buildout.download.Download(
        buildout['buildout'],
        hash_name=True,
    )

    for line in options['tessdata-urls'].splitlines():
        # Strip each entry so indentation or trailing blanks neither turn an
        # empty line into a bogus URL nor end up inside the md5sum.
        url, _, md5sum = line.strip().partition('#')
        if url:
            download(
                url,
                md5sum=md5sum,
                path=os.path.join(location, os.path.basename(url)),
            )
...@@ -7,6 +7,7 @@ extends = ...@@ -7,6 +7,7 @@ extends =
buildout.hash.cfg buildout.hash.cfg
../../component/fonts/buildout.cfg ../../component/fonts/buildout.cfg
../../component/git/buildout.cfg ../../component/git/buildout.cfg
../../component/ghostscript/buildout.cfg
../../component/graphviz/buildout.cfg ../../component/graphviz/buildout.cfg
../../component/gzip/buildout.cfg ../../component/gzip/buildout.cfg
../../component/xz-utils/buildout.cfg ../../component/xz-utils/buildout.cfg
...@@ -65,8 +66,6 @@ parts += ...@@ -65,8 +66,6 @@ parts +=
slapos-cookbook slapos-cookbook
mroonga-mariadb mroonga-mariadb
tesseract tesseract
tesseract-eng-traineddata
tesseract-osd-traineddata
zabbix-agent zabbix-agent
# Buildoutish # Buildoutish
...@@ -252,6 +251,7 @@ link-binary = ...@@ -252,6 +251,7 @@ link-binary =
${graphviz:location}/bin/dot ${graphviz:location}/bin/dot
${grep:location}/bin/grep ${grep:location}/bin/grep
${imagemagick:location}/bin/convert ${imagemagick:location}/bin/convert
${ghostscript:location}/bin/gs
${imagemagick:location}/bin/identify ${imagemagick:location}/bin/identify
${jpegoptim:location}/bin/jpegoptim ${jpegoptim:location}/bin/jpegoptim
${jsl:location}/bin/jsl ${jsl:location}/bin/jsl
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment