python3Packages.tokenizers: 0.8.1 -> 0.9.2
Changelog:

- https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0
- https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1
- https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2

Changes in the derivation:

* Tokenizers uses a newer version of pyo3 that does not require Rust nightly anymore. So, we do not have to use any cheat codes anymore.
* Tokenizers is now a mixed Rust/Python project. The way it is set up does not work with Maturin, so switch to setuptools-rust instead.
* Add additional data files needed for tests.
* Use `pytestCheckHook`.
This commit is contained in:
parent
93b3fa6088
commit
d2e918cc12
@ -2,10 +2,12 @@
|
||||
, rustPlatform
|
||||
, fetchFromGitHub
|
||||
, fetchurl
|
||||
, maturin
|
||||
, pipInstallHook
|
||||
, pytest
|
||||
, setuptools-rust
|
||||
, wheel
|
||||
, numpy
|
||||
, python
|
||||
, pytestCheckHook
|
||||
, requests
|
||||
}:
|
||||
|
||||
@ -18,10 +20,18 @@ let
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
|
||||
sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
|
||||
};
|
||||
albertVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
|
||||
sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
|
||||
};
|
||||
bertVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
|
||||
sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
|
||||
};
|
||||
norvigBig = fetchurl {
|
||||
url = "https://norvig.com/big.txt";
|
||||
sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
|
||||
};
|
||||
openaiVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
|
||||
sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
|
||||
@ -32,38 +42,34 @@ let
|
||||
};
|
||||
in rustPlatform.buildRustPackage rec {
|
||||
pname = "tokenizers";
|
||||
version = "0.8.1";
|
||||
version = "0.9.2";
|
||||
|
||||
src = fetchFromGitHub {
|
||||
owner = "huggingface";
|
||||
repo = pname;
|
||||
rev = "python-v${version}";
|
||||
sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
|
||||
sha256 = "0rsm1g5zfq3ygdb3s8v9xqqpgfzvvkc4n5ik3ahy8sw7pyjljb4m";
|
||||
};
|
||||
|
||||
# Update parking_lot to be compatible with recent Rust versions, that
|
||||
# replace asm! by llvm_asm!:
|
||||
#
|
||||
# https://github.com/Amanieu/parking_lot/pull/223
|
||||
#
|
||||
# Remove once upstream updates this dependency.
|
||||
cargoPatches = [ ./update-parking-lot.diff ];
|
||||
|
||||
cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";
|
||||
cargoSha256 = "0yn699dq9hdjh7fyci99ni8mmd5qdhzrsi80grzgf5cch8g38rbi";
|
||||
|
||||
sourceRoot = "source/bindings/python";
|
||||
|
||||
nativeBuildInputs = [
|
||||
maturin
|
||||
pipInstallHook
|
||||
setuptools-rust
|
||||
wheel
|
||||
];
|
||||
|
||||
propagatedBuildInputs = [
|
||||
numpy
|
||||
python
|
||||
];
|
||||
|
||||
# tokenizers uses pyo3, which requires Rust nightly.
|
||||
RUSTC_BOOTSTRAP = 1;
|
||||
installCheckInputs = [
|
||||
pytestCheckHook
|
||||
requests
|
||||
];
|
||||
|
||||
doCheck = false;
|
||||
doInstallCheck = true;
|
||||
@ -74,51 +80,21 @@ in rustPlatform.buildRustPackage rec {
|
||||
( cd $sourceRoot/tests/data
|
||||
ln -s ${robertaVocab} roberta-base-vocab.json
|
||||
ln -s ${robertaMerges} roberta-base-merges.txt
|
||||
ln -s ${albertVocab} albert-base-v1-tokenizer.json
|
||||
ln -s ${bertVocab} bert-base-uncased-vocab.txt
|
||||
ln -s ${norvigBig} big.txt
|
||||
ln -s ${openaiVocab} openai-gpt-vocab.json
|
||||
ln -s ${openaiMerges} openai-gpt-merges.txt )
|
||||
'';
|
||||
|
||||
postPatch = ''
|
||||
# pyo3's build check verifies that Rust is a nightly
|
||||
# version. Disable this check.
|
||||
substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
|
||||
--replace "check_rustc_version()?;" ""
|
||||
|
||||
# Patching the vendored dependency invalidates the file
|
||||
# checksums, so remove them. This should be safe, since
|
||||
# this is just a copy of the vendored dependencies and
|
||||
# the integrity of the vendored dependencies is validated
|
||||
# by cargoSha256.
|
||||
sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
|
||||
$NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json
|
||||
|
||||
# Maturin uses the crate name as the wheel name.
|
||||
substituteInPlace Cargo.toml \
|
||||
--replace "tokenizers-python" "tokenizers"
|
||||
'';
|
||||
|
||||
buildPhase = ''
|
||||
maturin build --release --manylinux off
|
||||
${python.interpreter} setup.py bdist_wheel
|
||||
'';
|
||||
|
||||
installPhase = ''
|
||||
# Put the wheels where the pip install hook can find them.
|
||||
install -Dm644 -t dist target/wheels/*.whl
|
||||
pipInstallPhase
|
||||
'';
|
||||
|
||||
installCheckInputs = [
|
||||
pytest
|
||||
requests
|
||||
];
|
||||
|
||||
installCheckPhase = ''
|
||||
# Append paths, or the binding's tokenizer module will be
|
||||
# used, since the test directories have __init__.py
|
||||
pytest --import-mode=append
|
||||
'';
|
||||
|
||||
meta = with stdenv.lib; {
|
||||
homepage = "https://github.com/huggingface/tokenizers";
|
||||
description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
|
||||
|
@ -1,63 +0,0 @@
|
||||
diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock
|
||||
index f50db71..ea71817 100644
|
||||
--- a/Cargo.lock
|
||||
+++ b/Cargo.lock
|
||||
@@ -269,7 +269,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
-version = "0.3.3"
|
||||
+version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@@ -337,16 +337,16 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
-version = "0.10.0"
|
||||
+version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
- "lock_api 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
- "parking_lot_core 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
+ "lock_api 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
+ "parking_lot_core 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
-version = "0.7.0"
|
||||
+version = "0.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@@ -409,7 +409,7 @@ dependencies = [
|
||||
"inventory 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"num-traits 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
- "parking_lot 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
+ "parking_lot 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"paste 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"pyo3cls 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex 1.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@@ -768,7 +768,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
"checksum itoa 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)" = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e"
|
||||
"checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||
"checksum libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)" = "dea0c0405123bba743ee3f91f49b1c7cfb684eef0da0a50110f758ccf24cdff0"
|
||||
-"checksum lock_api 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "79b2de95ecb4691949fea4716ca53cdbcfccb2c612e19644a8bad05edcf9f47b"
|
||||
+"checksum lock_api 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75"
|
||||
"checksum maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
|
||||
"checksum memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400"
|
||||
"checksum memoffset 0.5.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8"
|
||||
@@ -777,8 +777,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
"checksum number_prefix 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
|
||||
"checksum onig 6.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bd91ccd8a02fce2f7e8a86655aec67bc6c171e6f8e704118a0e8c4b866a05a8a"
|
||||
"checksum onig_sys 69.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3814583fad89f3c60ae0701d80e87e1fd3028741723deda72d0d4a0ecf0cb0db"
|
||||
-"checksum parking_lot 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "92e98c49ab0b7ce5b222f2cc9193fc4efe11c6d0bd4f648e374684a6857b1cfc"
|
||||
-"checksum parking_lot_core 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7582838484df45743c8434fbff785e8edf260c28748353d44bc0da32e0ceabf1"
|
||||
+"checksum parking_lot 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e"
|
||||
+"checksum parking_lot_core 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3"
|
||||
"checksum paste 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab4fb1930692d1b6a9cfabdde3d06ea0a7d186518e2f4d67660d8970e2fa647a"
|
||||
"checksum paste-impl 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "a62486e111e571b1e93b710b61e8f493c0013be39629b714cb166bdb06aa5a8a"
|
||||
"checksum pkg-config 0.3.17 (registry+https://github.com/rust-lang/crates.io-index)" = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677"
|
Loading…
Reference in New Issue
Block a user