diff --git a/Add-missing-include-for-BYTE_ORDER.patch b/Add-missing-include-for-BYTE_ORDER.patch deleted file mode 100644 index 3bc5b75d0cdf3256fa409ce03ebd8a32fc256dc1..0000000000000000000000000000000000000000 --- a/Add-missing-include-for-BYTE_ORDER.patch +++ /dev/null @@ -1,30 +0,0 @@ -From 624091a90e816f555106a1b1f994a45cb4989051 Mon Sep 17 00:00:00 2001 -From: Malcolm Smith -Date: Tue, 12 Jan 2021 13:43:28 +0000 -Subject: [PATCH 5/7] Add missing #include for BYTE_ORDER - ---- - src/util.h | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/src/util.h b/src/util.h -index bf8a758..1680f4b 100644 ---- a/src/util.h -+++ b/src/util.h -@@ -36,6 +36,13 @@ - #include - #endif - -+#if !defined(__APPLE__) && !defined(_WIN32) -+#include -+#if defined(BYTE_ORDER) && defined(__BIG_ENDIAN) && BYTE_ORDER == __BIG_ENDIAN -+#define IS_BIG_ENDIAN -+#endif -+#endif -+ - namespace sentencepiece { - - template --- -2.18.0.huawei.25 - diff --git a/Added-split_digits-to-SentencePieceTrainer.patch b/Added-split_digits-to-SentencePieceTrainer.patch deleted file mode 100644 index 769558784459e0cab2abd07fd809ecc42c51b70f..0000000000000000000000000000000000000000 --- a/Added-split_digits-to-SentencePieceTrainer.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 427d695ab4343568cc46411fbe83ef5ccc619752 Mon Sep 17 00:00:00 2001 -From: mingruimingrui -Date: Sat, 27 Jun 2020 02:56:03 +0800 -Subject: [PATCH 1/7] Added split_digits to SentencePieceTrainer - ---- - src/spec_parser.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/src/spec_parser.h b/src/spec_parser.h -index 729e036..6dd054b 100644 ---- a/src/spec_parser.h -+++ b/src/spec_parser.h -@@ -207,6 +207,7 @@ util::Status SentencePieceTrainer::SetProtoField(const std::string &name, - PARSE_BOOL(split_by_unicode_script); - PARSE_BOOL(split_by_number); - PARSE_BOOL(split_by_whitespace); -+ PARSE_BOOL(split_digits); - PARSE_BOOL(treat_whitespace_as_suffix); - PARSE_REPEATED_STRING(control_symbols); - 
PARSE_REPEATED_STRING(user_defined_symbols); --- -2.18.0.huawei.25 - diff --git a/Create-options.md.patch b/Create-options.md.patch deleted file mode 100644 index 6f8eb50bd8ba02d8c1c1536292e78bc2dd5954b8..0000000000000000000000000000000000000000 --- a/Create-options.md.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 5c09745aafa151be7ed5d9a9101f3e8c79a8758b Mon Sep 17 00:00:00 2001 -From: stephantul -Date: Thu, 1 Oct 2020 12:49:13 +0200 -Subject: [PATCH 3/7] Create options.md - ---- - doc/options.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 51 insertions(+) - create mode 100644 doc/options.md - -diff --git a/doc/options.md b/doc/options.md -new file mode 100644 -index 0000000..7861fdc ---- /dev/null -+++ b/doc/options.md -@@ -0,0 +1,51 @@ -+# Training options -+ -+The training options for the `spm_train` can be listed using `spm_train --help`. Since the standard `pip install` of sentencepiece does not necessarily install `spm_train`, the options are also listed here. -+ -+``` -+--help (show help) type: bool default: false -+--version (show version) type: bool default: false -+--minloglevel (Messages logged at a lower level than this don't actually get logged anywhere) type: int default: 0 -+--input (comma separated list of input sentences) type: std::string default: "" -+--input_format (Input format. Supported format is `text` or `tsv`.) 
type: std::string default: "" -+--model_prefix (output model prefix) type: std::string default: "" --model_type (model algorithm: unigram, bpe, word or char) type: std::string default: "unigram" -+--vocab_size (vocabulary size) type: int32 default: 8000 -+--accept_language (comma-separated list of languages this model can accept) type: std::string default: "" -+--self_test_sample_size (the size of self test samples) type: int32 default: 0 -+--character_coverage (character coverage to determine the minimum symbols) type: double default: 0.9995 -+--input_sentence_size (maximum size of sentences the trainer loads) type: int32 default: 0 -+--shuffle_input_sentence (Randomly sample input sentences in advance. Valid when --input_sentence_size > 0) type: bool default: true -+--seed_sentencepiece_size (the size of seed sentencepieces) type: int32 default: 1000000 -+--shrinking_factor (Keeps top shrinking_factor pieces with respect to the loss) type: double default: 0.75 -+--num_threads (number of threads for training) type: int32 default: 16 -+--num_sub_iterations (number of EM sub-iterations) type: int32 default: 2 -+--max_sentencepiece_length (maximum length of sentence piece) type: int32 default: 16 -+--max_sentence_length (maximum length of sentence in byte) type: int32 default: 4192 -+--split_by_unicode_script (use Unicode script to split sentence pieces) type: bool default: true -+--split_by_number (split tokens by numbers (0-9)) type: bool default: true -+--split_by_whitespace (use a white space to split sentence pieces) type: bool default: true -+--split_digits (split all digits (0-9) into separate pieces) type: bool default: false -+--treat_whitespace_as_suffix (treat whitespace marker as suffix instead of prefix.) 
type: bool default: false -+--control_symbols (comma separated list of control symbols) type: std::string default: "" -+--user_defined_symbols (comma separated list of user defined symbols) type: std::string default: "" -+--required_chars (UTF8 characters in this flag are always used in the character set regardless of --character_coverage) type: std::string default: "" -+--byte_fallback (decompose unknown pieces into UTF-8 byte pieces) type: bool default: false -+--vocabulary_output_piece_score (Define score in vocab file) type: bool default: true -+--normalization_rule_name (Normalization rule name. Choose from nfkc or identity) type: std::string default: "nmt_nfkc" -+--normalization_rule_tsv (Normalization rule TSV file. ) type: std::string default: "" -+--denormalization_rule_tsv (Denormalization rule TSV file.) type: std::string default: "" -+--add_dummy_prefix (Add dummy whitespace at the beginning of text) type: bool default: true -+--remove_extra_whitespaces (Removes leading, trailing, and duplicate internal whitespace) type: bool default: true -+--hard_vocab_limit (If set to false, --vocab_size is considered as a soft limit.) type: bool default: true -+--use_all_vocab (If set to true, use all tokens as vocab. Valid for word/char models.) type: bool default: false -+--unk_id (Override UNK () id.) type: int32 default: 0 -+--bos_id (Override BOS () id. Set -1 to disable BOS.) type: int32 default: 1 -+--eos_id (Override EOS () id. Set -1 to disable EOS.) type: int32 default: 2 -+--pad_id (Override PAD () id. Set -1 to disable PAD.) type: int32 default: -1 -+--unk_piece (Override UNK () piece.) type: std::string default: "" -+--bos_piece (Override BOS () piece.) type: std::string default: "" -+--eos_piece (Override EOS () piece.) type: std::string default: "" -+--pad_piece (Override PAD () piece.) type: std::string default: "" -+--unk_surface (Dummy surface string for . In decoding is decoded to `unk_surface`.) 
type: std::string default: " ⁇ " -+--train_extremely_large_corpus (Increase bit depth for unigram tokenization.) type: bool default: false -+``` --- -2.18.0.huawei.25 - diff --git a/Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch b/Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch deleted file mode 100644 index 13534310fcdcc78169cdaa55dd8ca5cfcf69fb0c..0000000000000000000000000000000000000000 --- a/Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch +++ /dev/null @@ -1,30 +0,0 @@ -From 2ea571b8e509809bbe28e6cc3f1488b3cfde1ef9 Mon Sep 17 00:00:00 2001 -From: Kentaro Hayashi -Date: Sat, 17 Oct 2020 16:54:20 +0900 -Subject: [PATCH 4/7] Fix FTBFS on armel, mips, powerpc, m68k and sh4 - ---- - src/CMakeLists.txt | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 511b2ec..87765e5 100644 ---- a/src/CMakeLists.txt -+++ b/src/CMakeLists.txt -@@ -197,6 +197,13 @@ target_link_libraries(sentencepiece_train-static INTERFACE sentencepiece-static - if (SPM_ENABLE_SHARED) - target_link_libraries(sentencepiece ${SPM_LIBS}) - target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece) -+ if ((${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7l") OR -+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "mips") OR -+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "m68k") OR -+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc") OR -+ (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "sh4")) -+ list(APPEND SPM_LIBS "atomic") -+ endif() - set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static) - set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0) - set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) --- -2.18.0.huawei.25 - diff --git a/Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch b/Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch deleted file mode 100644 index 
7691daada9a7e04d31cd26beddf8b0e096955ef4..0000000000000000000000000000000000000000 --- a/Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 82b8b6f61403fcfcef673ee49ed2dfe475ba4cf2 Mon Sep 17 00:00:00 2001 -From: Sarubi -Date: Tue, 23 Feb 2021 20:47:25 +0530 -Subject: [PATCH] Removed codes where Zero Width Joiner replaced with - whitespace. - ---- - data/nmt_nfkc.tsv | 3 +-- - data/nmt_nfkc_cf.tsv | 3 +-- - src/builder.cc | 1 - - 3 files changed, 2 insertions(+), 5 deletions(-) - -diff --git a/data/nmt_nfkc.tsv b/data/nmt_nfkc.tsv -index 1ce2b71..5c8b48b 100644 ---- a/data/nmt_nfkc.tsv -+++ b/data/nmt_nfkc.tsv -@@ -57263,8 +57263,7 @@ FB9 F90 FB5 # ྐྵ => ྐྵ - 200A 20 #   => - 200B 20 # ​ => - 200C 20 # ‌ => --200D 20 # ‍ => --200E 20 # ‎ => -+200E 20 # ‎ => - 200F 20 # ‏ => - 2011 2010 # ‑ => ‐ - 2017 20 333 # ‗ => ̳ -diff --git a/data/nmt_nfkc_cf.tsv b/data/nmt_nfkc_cf.tsv -index 2178882..0d0e708 100644 ---- a/data/nmt_nfkc_cf.tsv -+++ b/data/nmt_nfkc_cf.tsv -@@ -57980,8 +57980,7 @@ FB9 F90 FB5 # ྐྵ => ྐྵ - 200A 20 #   => - 200B 20 # ​ => - 200C 20 # ‌ => --200D 20 # ‍ => --200E 20 # ‎ => -+200E 20 # ‎ => - 200F 20 # ‏ => - 2011 2010 # ‑ => ‐ - 2017 20 333 # ‗ => ̳ -diff --git a/src/builder.cc b/src/builder.cc -index d9442d3..9f47aac 100644 ---- a/src/builder.cc -+++ b/src/builder.cc -@@ -366,7 +366,6 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) { - nfkc_map[{0xFEFF}] = {0x20}; // ZERO WIDTH NO-BREAK - nfkc_map[{0xFFFD}] = {0x20}; // REPLACEMENT CHARACTER - nfkc_map[{0x200C}] = {0x20}; // ZERO WIDTH NON-JOINER -- nfkc_map[{0x200D}] = {0x20}; // ZERO WIDTH JOINER - - // Ascii Control characters - nfkc_map[{0x0001}] = {}; --- - diff --git a/Restore-the-sentence-boundary-marker-insertion-for-t.patch b/Restore-the-sentence-boundary-marker-insertion-for-t.patch deleted file mode 100644 index 8d53ad6cc50e0420c34f8f66f05fdd64812ae2a4..0000000000000000000000000000000000000000 --- 
a/Restore-the-sentence-boundary-marker-insertion-for-t.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 21aa7a9d6a3bd6a98c480bea02e0e81b21f411af Mon Sep 17 00:00:00 2001 -From: joe <219651+AdolfVonKleist@users.noreply.github.com> -Date: Mon, 22 Mar 2021 17:26:20 +0000 -Subject: [PATCH 7/7] Restore the sentence boundary marker insertion for the - unigram trainer. Dramatically speeds up training time. - ---- - src/unigram_model_trainer.cc | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc -index 5f26771..94c7adb 100644 ---- a/src/unigram_model_trainer.cc -+++ b/src/unigram_model_trainer.cc -@@ -119,6 +119,7 @@ TrainerModel::SentencePieces Trainer::MakeSeedSentencePieces() const { - all_chars[string_util::UnicodeCharToUTF8(c)] += w.second; - } - } -+ array.push_back(kSentenceBoundary); // sentence boundary marker. - } - - const node_int_type n = array.size(); --- -2.18.0.huawei.25 - diff --git a/fix_of_an_unattainable_condition.patch b/fix_of_an_unattainable_condition.patch deleted file mode 100644 index 4882a2d782f0753bfe6611d62de0d6d556b0f47d..0000000000000000000000000000000000000000 --- a/fix_of_an_unattainable_condition.patch +++ /dev/null @@ -1,22 +0,0 @@ -diff --git a/third_party/esaxx/sais.hxx b/third_party/esaxx/sais.hxx -index f1702f8..b9071c8 100644 ---- a/third_party/esaxx/sais.hxx -+++ b/third_party/esaxx/sais.hxx -@@ -179,7 +179,7 @@ typedef typename std::iterator_traits::value_type char_type; - sort all the S-substrings */ - if(fs < (maxthreads * k)) { - index_type *C, *B; -- if((C = new index_type[maxthreads * k]) == 0) { return -2; } -+ C = new index_type[maxthreads * k]; - B = (1 < maxthreads) ? 
C + k : C; - getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */ - #ifdef _OPENMP -@@ -271,7 +271,7 @@ typedef typename std::iterator_traits::value_type char_type; - /* stage 3: induce the result for the original problem */ - if(fs < (maxthreads * k)) { - index_type *B, *C; -- if((C = new index_type[maxthreads * k]) == 0) { return -2; } -+ C = new index_type[maxthreads * k]; - B = (1 < maxthreads) ? C + k : C; - /* put all left-most S characters into their buckets */ - getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */ diff --git a/only-install-proto-headers-if-not-using-builtin-prot.patch b/only-install-proto-headers-if-not-using-builtin-prot.patch deleted file mode 100644 index 9a72915f2afbe299674a7ff6f2c4373772fded01..0000000000000000000000000000000000000000 --- a/only-install-proto-headers-if-not-using-builtin-prot.patch +++ /dev/null @@ -1,29 +0,0 @@ -From a069cd5518c11750b734b85887dcc74ec6f9457f Mon Sep 17 00:00:00 2001 -From: mark -Date: Wed, 10 Feb 2021 10:59:56 -0800 -Subject: [PATCH 6/7] only install proto headers if not using builtin proto - ---- - src/CMakeLists.txt | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 87765e5..3d31259 100644 ---- a/src/CMakeLists.txt -+++ b/src/CMakeLists.txt -@@ -272,8 +272,11 @@ install(TARGETS ${SPM_INSTALLTARGETS} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) --install(FILES sentencepiece_trainer.h sentencepiece_processor.h -+install(FILES sentencepiece_trainer.h sentencepiece_processor.h ${SPM_PROTO_HDRS} - DESTINATION ${CMAKE_INSTALL_INCDIR}) -+if (NOT SPM_USE_BUILTIN_PROTOBUF) -+ install(FILES ${SPM_PROTO_HDRS} DESTINATION ${CMAKE_INSTALL_INCDIR}) -+endif() - - file(TO_NATIVE_PATH "${PROJECT_SOURCE_DIR}/data" data_dir) - --- -2.18.0.huawei.25 - diff --git 
a/sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch b/sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch deleted file mode 100644 index ec371fe822126f82b19c398413efc0418873011e..0000000000000000000000000000000000000000 --- a/sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch +++ /dev/null @@ -1,27 +0,0 @@ -From cc1380a1608d8e7913e943e8530798c882c4fe6c Mon Sep 17 00:00:00 2001 -From: Aaron Burke -Date: Fri, 21 Aug 2020 10:15:42 -0700 -Subject: [PATCH 2/7] sentencepiece.pc should be installed from - CMAKE_CURRENT_BINARY_DIR, not CMAKE_BINARY_DIR, to support being included - (and installed) from other projects - ---- - CMakeLists.txt | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 6481dfd..9124f9e 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -78,7 +78,7 @@ configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h") - configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY) - - if (NOT MSVC) -- install(FILES "${CMAKE_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) -+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) - endif() - - include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR}) --- -2.18.0.huawei.25 - diff --git a/sentencepiece.spec b/sentencepiece.spec index 4bf4dfdc201b342c8e31df40da53b7eab1fcc1d0..69163fd43fa37274380832a014408a3ccce5c317 100644 --- a/sentencepiece.spec +++ b/sentencepiece.spec @@ -1,19 +1,10 @@ Name: sentencepiece -Version: 0.1.92 -Release: 6 +Version: 0.1.98 +Release: 1 Summary: An unsupervised text tokenizer and detokenizer License: Apache-2.0 URL: https://github.com/google/sentencepiece Source0: https://github.com/google/sentencepiece/archive/v%{version}.tar.gz -Patch0: Removed-codes-where-Zero-Width-Joiner-replaced-with-.patch -Patch1: fix_of_an_unattainable_condition.patch -Patch2: 
Added-split_digits-to-SentencePieceTrainer.patch -Patch3: sentencepiece.pc-should-be-installed-from-CMAKE_CURR.patch -Patch4: Create-options.md.patch -Patch5: Fix-FTBFS-on-armel-mips-powerpc-m68k-and-sh4.patch -Patch6: Add-missing-include-for-BYTE_ORDER.patch -Patch7: only-install-proto-headers-if-not-using-builtin-prot.patch -Patch8: Restore-the-sentence-boundary-marker-insertion-for-t.patch BuildRequires: gcc-c++ gcc autoconf pkgconfig protobuf-compiler protobuf BuildRequires: cmake >= 3.14.0 Requires: protobuf protobuf-compiler @@ -47,6 +38,8 @@ cd ../../ %install cd cmake/build make install +sed -i'' -e "s,%{buildroot},," %{buildroot}%{_libdir}/pkgconfig/sentencepiece.pc +sed -i'' -e "s,${prefix}/lib,%{_libdir}," %{buildroot}%{_libdir}/pkgconfig/sentencepiece.pc %files %defattr(-,root,root) @@ -56,6 +49,9 @@ make install %{_includedir}/sentencepiece*.h %changelog +* Fri May 12 2023 liuyongqi - 0.1.98-1 +- Sentencepiece version update: Upgraded from 0.1.92 to 0.1.98 + * Fri Nov 27 2021 xiefangqi - 0.1.92.6 - Fix split_digits support to SentencepieceTrainer spec parser - Add sentencepiece.pc install @@ -64,13 +60,18 @@ make install - Fix endian problem on android plarform - Fix pb protobuf header file can't find problem - Restore the sentence boundary + * Tue Nov 16 2021 xiefangqi - 0.1.92.5 - add README.md/README.en.md + * Tue Nov 2 2021 xiefangqi - 0.1.92-4 - fix of an unattainable condition + * Tue Nov 2 2021 xiefangqi - 0.1.92-3 - Prevent Zero Width Joiner replaced with whitespace + * Wed Sep 29 2021 xiefangqi - 0.1.92-2 - add test cases + * Wed Sep 23 2021 xiefangqi - 0.1.92-1 - package init diff --git a/v0.1.92.tar.gz b/v0.1.98.tar.gz similarity index 55% rename from v0.1.92.tar.gz rename to v0.1.98.tar.gz index a613caf2fc083e9fe4c9bb5bce043803e66d3371..405164ec1e7e7dd348ae8c8eec69b38489325f25 100644 Binary files a/v0.1.92.tar.gz and b/v0.1.98.tar.gz differ