Merge pull request #174322 from fabaff/tokenizers-fix

python310Packages.tokenizers: unstable-2021-08-13 -> 0.12.1
This commit is contained in:
Fabian Affolter 2022-05-25 08:51:43 +02:00 committed by GitHub
commit 740b6b453d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 21 deletions

View File

@ -1,18 +1,24 @@
{ lib { lib
, stdenv , stdenv
, buildPythonPackage
, datasets
, fetchFromGitHub , fetchFromGitHub
, fetchurl , fetchurl
, buildPythonPackage
, rustPlatform
, setuptools-rust
, libiconv , libiconv
, numpy , numpy
, datasets , openssl
, pkg-config
, pytestCheckHook , pytestCheckHook
, pythonOlder
, requests , requests
, rustPlatform
, Security
, setuptools-rust
}: }:
let let
# See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
# about URLs and file names
robertaVocab = fetchurl { robertaVocab = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"; url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy"; sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
@ -49,33 +55,42 @@ let
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"; url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f"; sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
}; };
in buildPythonPackage rec { in
buildPythonPackage rec {
pname = "tokenizers"; pname = "tokenizers";
version = "unstable-2021-08-13"; version = "0.12.1";
disabled = pythonOlder "3.7";
src = fetchFromGitHub { src = fetchFromGitHub {
owner = "huggingface"; owner = "huggingface";
repo = pname; repo = pname;
rev = "e7dd6436dd4a4ffd9e8a4f110ca68e6a38677cb6"; rev = "python-v${version}";
sha256 = "1p7w9a43a9h6ys5nsa4g89l65dj11037p7a1lqkj4x1yc9kv2y1r"; hash = "sha256-XIXKgcqa6ToAH4OkyaaJALOS9F+sD8d5Z71RttRcIsw=";
}; };
cargoDeps = rustPlatform.fetchCargoTarball { cargoDeps = rustPlatform.fetchCargoTarball {
inherit src sourceRoot; inherit src sourceRoot;
name = "${pname}-${version}"; name = "${pname}-${version}";
sha256 = "1yb4jsx6mp9jgd1g3mli6vr6mri2afnwqlmxq1rpvn34z6b3iw9q"; sha256 = "sha256-Euvf0LNMa2Od+6gY1Ldge/7VPrH5mJoZduRRsb+lM/E=";
}; };
sourceRoot = "source/bindings/python"; sourceRoot = "source/bindings/python";
nativeBuildInputs = [ setuptools-rust ] ++ (with rustPlatform; [ nativeBuildInputs = [
pkg-config
setuptools-rust
] ++ (with rustPlatform; [
cargoSetupHook cargoSetupHook
rust.cargo rust.cargo
rust.rustc rust.rustc
]); ]);
buildInputs = lib.optionals stdenv.isDarwin [ buildInputs = [
openssl
] ++ lib.optionals stdenv.isDarwin [
libiconv libiconv
Security
]; ];
propagatedBuildInputs = [ propagatedBuildInputs = [
@ -89,7 +104,7 @@ in buildPythonPackage rec {
]; ];
postUnpack = '' postUnpack = ''
# Add data files for tests, otherwise tests attempt network access. # Add data files for tests, otherwise tests attempt network access
mkdir $sourceRoot/tests/data mkdir $sourceRoot/tests/data
( cd $sourceRoot/tests/data ( cd $sourceRoot/tests/data
ln -s ${robertaVocab} roberta-base-vocab.json ln -s ${robertaVocab} roberta-base-vocab.json
@ -103,24 +118,28 @@ in buildPythonPackage rec {
ln -s ${openaiMerges} openai-gpt-merges.txt ) ln -s ${openaiMerges} openai-gpt-merges.txt )
''; '';
postPatch = '' preCheck = ''
echo 'import multiprocessing; multiprocessing.set_start_method("fork")' >> tests/__init__.py export HOME=$(mktemp -d);
''; '';
preCheck = '' pythonImportsCheck = [
HOME=$TMPDIR "tokenizers"
''; ];
disabledTests = [ disabledTests = [
# Downloads data using the datasets module. # Downloads data using the datasets module
"TestTrainFromIterators" "TestTrainFromIterators"
# Those tests require more data
"test_from_pretrained"
"test_from_pretrained_revision"
"test_continuing_prefix_trainer_mistmatch"
]; ];
meta = with lib; { meta = with lib; {
homepage = "https://github.com/huggingface/tokenizers";
description = "Fast State-of-the-Art Tokenizers optimized for Research and Production"; description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
homepage = "https://github.com/huggingface/tokenizers";
license = licenses.asl20; license = licenses.asl20;
platforms = platforms.unix;
maintainers = with maintainers; [ ]; maintainers = with maintainers; [ ];
platforms = platforms.unix;
}; };
} }

View File

@ -10370,7 +10370,9 @@ in {
token-bucket = callPackage ../development/python-modules/token-bucket { }; token-bucket = callPackage ../development/python-modules/token-bucket { };
tokenizers = toPythonModule (callPackage ../development/python-modules/tokenizers { }); tokenizers = callPackage ../development/python-modules/tokenizers {
inherit (pkgs.darwin.apple_sdk.frameworks) Security;
};
tokenize-rt = toPythonModule (callPackage ../development/python-modules/tokenize-rt { }); tokenize-rt = toPythonModule (callPackage ../development/python-modules/tokenize-rt { });