tesseract: add a wrapper to set up languages
Tesseract is now decoupled from the tessdata language corpus. This avoids recompilation when building Tesseract with a custom set of languages. Update k2pdfopt to use the new wrapper interface.
This commit is contained in:
parent
45d2a2dd91
commit
aaaed13077
@ -1,11 +1,18 @@
|
||||
{ callPackage, lowPrio }:
|
||||
|
||||
let
|
||||
tesseract3 = callPackage ./tesseract3.nix {};
|
||||
tesseract4 = callPackage ./tesseract4.nix {};
|
||||
base3 = callPackage ./tesseract3.nix {};
|
||||
base4 = callPackage ./tesseract4.nix {};
|
||||
languages = callPackage ./languages.nix {};
|
||||
in
|
||||
{
|
||||
tesseract = tesseract3;
|
||||
tesseract = callPackage ./wrapper.nix {
|
||||
tesseractBase = base3;
|
||||
languages = languages.v3;
|
||||
};
|
||||
|
||||
tesseract_4 = lowPrio tesseract4;
|
||||
tesseract_4 = lowPrio (callPackage ./wrapper.nix {
|
||||
tesseractBase = base4;
|
||||
languages = languages.v4;
|
||||
});
|
||||
}
|
||||
|
43
pkgs/applications/graphics/tesseract/languages.nix
Normal file
43
pkgs/applications/graphics/tesseract/languages.nix
Normal file
@ -0,0 +1,43 @@
|
||||
# Tesseract language data ("tessdata") packaged on its own, so that it can be
# combined with a Tesseract build by a separate wrapper.
# Exposes `makeLanguages` plus two instantiations of it, `v3` and `v4`.
{ stdenv, lib, fetchurl, fetchFromGitHub }:
|
||||
|
||||
rec {
|
||||
# Builds the language attribute set for one revision of tesseract-ocr/tessdata.
#   tessdataRev: revision (commit or tag) of the tessdata repository to use
#   tessdata:    sha256 passed to fetchFromGitHub for that revision
#   all:         output hash of the `all` derivation defined below
#   languages:   accepted, but not referenced in the part of the file visible here
makeLanguages = { tessdataRev, tessdata ? null, all ? null, languages ? {} }:
|
||||
let
|
||||
# Full checkout of the tessdata repository at `tessdataRev`.
tessdataSrc = fetchFromGitHub {
|
||||
owner = "tesseract-ocr";
|
||||
repo = "tessdata";
|
||||
rev = tessdataRev;
|
||||
sha256 = tessdata;
|
||||
};
|
||||
|
||||
# Fetches a single `<lang>.traineddata` file of the same revision by URL.
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably meant to be used together with the `languages` argument - confirm.
languageFile = lang: sha256: fetchurl {
|
||||
url = "https://github.com/tesseract-ocr/tessdata/raw/${tessdataRev}/${lang}.traineddata";
|
||||
inherit sha256;
|
||||
};
|
||||
in
|
||||
{
|
||||
# Directory holding every *.traineddata file from the repository checkout.
# The outputHash* attributes make this a fixed-output derivation whose
# expected hash is the `all` argument.
all = stdenv.mkDerivation {
|
||||
name = "all";
|
||||
buildCommand = ''
|
||||
mkdir $out
|
||||
cd ${tessdataSrc}
|
||||
cp *.traineddata $out
|
||||
'';
|
||||
outputHashMode = "recursive";
|
||||
outputHashAlgo = "sha256";
|
||||
outputHash = all;
|
||||
};
|
||||
};
|
||||
|
||||
# Language data pinned to a specific tessdata commit (paired with the
# Tesseract 3 base package by the caller of this file).
v3 = makeLanguages {
|
||||
tessdataRev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
|
||||
tessdata = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
|
||||
all = "0yj6h9n6h0kzzcqsn3z87vsi8pa60szp0yiayb0znd0v9my0dqhn";
|
||||
};
|
||||
|
||||
# Language data pinned to the "4.0.0" revision of tessdata.
v4 = makeLanguages {
|
||||
tessdataRev = "4.0.0";
|
||||
tessdata = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
|
||||
all = "0dqgkp369rcvq72yhgnzj1pj8yrv7kqzc7y6sqs7nzcq7l5qazlg";
|
||||
};
|
||||
}
|
@ -1,37 +1,5 @@
|
||||
{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
|
||||
, leptonica, libpng, libtiff, icu, pango, opencl-headers
|
||||
# Supported list of languages or `null' for all available languages
|
||||
, enableLanguages ? null
|
||||
# if you want just a specific list of languages, optionally specify a hash
|
||||
# to make tessdata a fixed output derivation.
|
||||
, enableLanguagesHash ? (if enableLanguages == null # all languages
|
||||
then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw"
|
||||
else null)
|
||||
}:
|
||||
|
||||
let tessdata = stdenv.mkDerivation ({
|
||||
name = "tessdata";
|
||||
src = fetchFromGitHub {
|
||||
owner = "tesseract-ocr";
|
||||
repo = "tessdata";
|
||||
rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
|
||||
# when updating, don't forget to update the default value of enableLanguagesHash
|
||||
sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
|
||||
};
|
||||
buildCommand = ''
|
||||
cd $src;
|
||||
for lang in ${if enableLanguages==null then "*.traineddata" else stdenv.lib.concatMapStringsSep " " (x: x+".traineddata") enableLanguages} ; do
|
||||
install -Dt $out/share/tessdata $src/$lang ;
|
||||
done;
|
||||
'';
|
||||
preferLocalBuild = true;
|
||||
} // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
|
||||
# when a hash is given, we make this a fixed output derivation.
|
||||
outputHashMode = "recursive";
|
||||
outputHashAlgo = "sha256";
|
||||
outputHash = enableLanguagesHash;
|
||||
}));
|
||||
in
|
||||
{ stdenv, fetchurl, fetchFromGitHub, autoreconfHook, pkgconfig
|
||||
, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
|
||||
|
||||
stdenv.mkDerivation rec {
|
||||
name = "tesseract-${version}";
|
||||
@ -51,17 +19,11 @@ stdenv.mkDerivation rec {
|
||||
|
||||
LIBLEPT_HEADERSDIR = "${leptonica}/include";
|
||||
|
||||
postInstall = ''
|
||||
for i in ${tessdata}/share/tessdata/*; do
|
||||
ln -s $i $out/share/tessdata;
|
||||
done
|
||||
'';
|
||||
|
||||
meta = {
|
||||
description = "OCR engine";
|
||||
homepage = https://github.com/tesseract-ocr/tesseract;
|
||||
license = stdenv.lib.licenses.asl20;
|
||||
maintainers = with stdenv.lib.maintainers; [viric];
|
||||
maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
|
||||
platforms = with stdenv.lib.platforms; linux ++ darwin;
|
||||
};
|
||||
}
|
||||
|
@ -1,9 +1,5 @@
|
||||
{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
|
||||
, leptonica, libpng, libtiff, icu, pango, opencl-headers
|
||||
|
||||
# Supported list of languages or `null' for all available languages
|
||||
, enableLanguages ? null
|
||||
}:
|
||||
, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
|
||||
|
||||
stdenv.mkDerivation rec {
|
||||
name = "tesseract-${version}";
|
||||
@ -16,46 +12,16 @@ stdenv.mkDerivation rec {
|
||||
sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8";
|
||||
};
|
||||
|
||||
tessdata = fetchFromGitHub {
|
||||
owner = "tesseract-ocr";
|
||||
repo = "tessdata";
|
||||
rev = version;
|
||||
sha256 = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
|
||||
};
|
||||
enableParallelBuilding = true;
|
||||
|
||||
nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
|
||||
buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
|
||||
|
||||
# Copy the .traineddata files of the languages specified in enableLanguages
|
||||
# into `$out/share/tessdata' and check afterwards if copying was successful.
|
||||
postInstall = let
|
||||
mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
|
||||
mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
|
||||
findLangArgs = if enableLanguages != null
|
||||
then "\\( ${mkFindArgs enableLanguages} \\)"
|
||||
else "-iname '*.traineddata'";
|
||||
in ''
|
||||
numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
|
||||
${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
|
||||
|
||||
${if enableLanguages != null then ''
|
||||
expected=${toString (builtins.length enableLanguages)}
|
||||
'' else ''
|
||||
expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
|
||||
''}
|
||||
|
||||
if [ "$numLangs" -ne "$expected" ]; then
|
||||
echo "Expected $expected languages, but $numLangs" \
|
||||
"were copied to \`$out/share/tessdata'" >&2
|
||||
exit 1
|
||||
fi
|
||||
'';
|
||||
|
||||
meta = {
|
||||
description = "OCR engine";
|
||||
homepage = https://github.com/tesseract-ocr/tesseract;
|
||||
license = stdenv.lib.licenses.asl20;
|
||||
maintainers = with stdenv.lib.maintainers; [viric];
|
||||
platforms = with stdenv.lib.platforms; linux;
|
||||
maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
|
||||
platforms = with stdenv.lib.platforms; linux ++ darwin;
|
||||
};
|
||||
}
|
||||
|
69
pkgs/applications/graphics/tesseract/wrapper.nix
Normal file
69
pkgs/applications/graphics/tesseract/wrapper.nix
Normal file
@ -0,0 +1,69 @@
|
||||
# Combines a Tesseract build (`tesseractBase`) with a set of language data
# files and returns the resulting package. The result also carries
# `tesseractBase`, `languages` and `tessdata` as attributes.
#   tesseractBase: the Tesseract derivation to wrap
#   languages:     attribute set providing at least `all` (see the `tessdata` default)
{ stdenv, makeWrapper, tesseractBase, languages
|
||||
|
||||
# A list of languages like [ "eng" "spa" … ] or `null` for all available languages
|
||||
, enableLanguages ? null
|
||||
|
||||
# A list of files or a directory containing files
|
||||
# Default: `languages.all` when no language list is given; otherwise a
# derivation that installs only the requested `<lang>.traineddata` files,
# taken from `languages.all`, into its output directory.
, tessdata ? (
|
||||
if enableLanguages == null then
|
||||
languages.all
|
||||
else stdenv.mkDerivation ({
|
||||
name = "tessdata";
|
||||
buildCommand = ''
|
||||
for lang in ${stdenv.lib.concatMapStringsSep " " (x: x + ".traineddata") enableLanguages}; do
|
||||
install -Dt $out ${languages.all}/$lang
|
||||
done
|
||||
'';
|
||||
preferLocalBuild = true;
|
||||
} // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
|
||||
# when a hash is given, we make this a fixed output derivation.
|
||||
outputHashMode = "recursive";
|
||||
outputHashAlgo = "sha256";
|
||||
outputHash = enableLanguagesHash;
|
||||
}))
|
||||
)
|
||||
|
||||
# Optional output hash for the language subset built by the `tessdata` default
# above; only consulted when `enableLanguages` is a list.
, enableLanguagesHash ? null
|
||||
}:
|
||||
|
||||
let
|
||||
# Extra attributes merged onto the final result with `//` (see `tesseract`
# below), so callers can reach e.g. `.tesseractBase` on the returned package.
passthru = { inherit tesseractBase languages tessdata; };
|
||||
|
||||
# `tesseractBase` with its build replaced by a script that assembles a new
# output from the base package plus the selected tessdata.
tesseractWithData = tesseractBase.overrideAttrs (_: {
|
||||
# Exposes both values to the build script as $tesseractBase and $tessdata.
inherit tesseractBase tessdata;
|
||||
|
||||
buildInputs = [ makeWrapper ];
|
||||
|
||||
# Steps performed by the script below:
#  1. Create $out/bin/tesseract as a wrapper around the base binary that
#     defaults TESSDATA_PREFIX to $out/share/tessdata (`--set-default`).
#  2. Recreate include/ and share/ from the base package as symlink trees.
#  3. Copy lib/ for real and rewrite every occurrence of the base store path
#     to $out. The script aborts if the two paths differ in length -
#     presumably because a same-length substitution is needed to keep
#     binary files intact.
#  4. Link the language files into $out/share/tessdata: either the contents
#     of the $tessdata directory, or each listed file with its
#     `/nix/store…-` prefix stripped from the link name.
#  5. Fail unless eng.traineddata ended up in $out/share/tessdata.
# `''${` is the indented-string escape that yields a literal `${` for bash.
buildCommand = ''
|
||||
makeWrapper {$tesseractBase,$out}/bin/tesseract --set-default TESSDATA_PREFIX $out/share/tessdata
|
||||
|
||||
# Recursively link include, share
|
||||
cp -rs --no-preserve=mode $tesseractBase/{include,share} $out
|
||||
|
||||
cp -r --no-preserve=mode $tesseractBase/lib $out
|
||||
# Fixup the store paths in lib so that the tessdata from this derivation is used.
|
||||
if (( ''${#tesseractBase} != ''${#out} )); then
|
||||
echo "Can't replace store paths due to differing lengths"
|
||||
exit 1
|
||||
fi
|
||||
find $out/lib -type f -exec sed -i "s|$tesseractBase|$out|g" {} \;
|
||||
|
||||
if [[ -d "$tessdata" ]]; then
|
||||
ln -s $tessdata/* $out/share/tessdata
|
||||
else
|
||||
for lang in $tessdata; do
|
||||
ln -s $lang $out/share/tessdata/''${lang#/nix/store*-}
|
||||
done
|
||||
fi
|
||||
|
||||
if [[ ! -e $out/share/tessdata/eng.traineddata ]]; then
|
||||
# This is a bug in Tesseract's internal tessdata discovery mechanism
|
||||
echo "eng.traineddata must be present in tessdata for Tesseract to work"
|
||||
exit 1
|
||||
fi
|
||||
'';
|
||||
});
|
||||
|
||||
# An explicitly empty language list returns the unwrapped base package;
# any other value (including `null`) returns the wrapped variant.
tesseract = (if enableLanguages == [] then tesseractBase else tesseractWithData) // passthru;
|
||||
in
|
||||
tesseract
|
@ -75,19 +75,21 @@ stdenv.mkDerivation rec {
|
||||
cp ${src}/leptonica_mod/* src/
|
||||
'';
|
||||
});
|
||||
tesseract_modded = tesseract.overrideAttrs (attrs: {
|
||||
prePatch = ''
|
||||
cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
|
||||
cp ${src}/tesseract_mod/dawg.cpp api/
|
||||
cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
|
||||
cp ${src}/tesseract_mod/openclwrapper.h opencl/
|
||||
cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
|
||||
cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
|
||||
cp ${src}/tesseract_mod/tesscapi.cpp api/
|
||||
cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
|
||||
'';
|
||||
patches = [ ./tesseract.patch ];
|
||||
});
|
||||
tesseract_modded = tesseract.override {
|
||||
tesseractBase = tesseract.tesseractBase.overrideAttrs (_: {
|
||||
prePatch = ''
|
||||
cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
|
||||
cp ${src}/tesseract_mod/dawg.cpp api/
|
||||
cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
|
||||
cp ${src}/tesseract_mod/openclwrapper.h opencl/
|
||||
cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
|
||||
cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
|
||||
cp ${src}/tesseract_mod/tesscapi.cpp api/
|
||||
cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
|
||||
'';
|
||||
patches = [ ./tesseract.patch ];
|
||||
});
|
||||
};
|
||||
in
|
||||
[ zlib libpng ] ++
|
||||
optional enableGSL gsl ++
|
||||
|
Loading…
Reference in New Issue
Block a user