Module kiwipiepy.transformers_addon
Added in version: 0.15.1
The transformers_addon module provides the KiwiTokenizer class, which wraps Kiwi's SwTokenizer
so that it is compatible with
the huggingface transformers tokenizer.
Using this feature requires transformers>=4.12.
from transformers import AutoTokenizer
import kiwipiepy.transformers_addon
# Importing kiwipiepy.transformers_addon automatically
# registers KiwiTokenizer with AutoTokenizer.
# KiwiTokenizer is compatible with most of the huggingface transformers
# tokenizer API, so existing transformers-based code can be reused as-is.
# Assume an SwTokenizer has been saved to some_path/tokenizer.json
tokenizer = AutoTokenizer.from_pretrained('some_path')
tokenizer.encode("한국어를 고려한 토크나이저!")
Most of KiwiTokenizer's functionality is compatible with transformers, but the following features are currently unsupported:
- add_tokens, add_special_tokens, and other ways of adding new tokens
- the stride, is_split_into_words, return_overflowing_tokens, return_special_tokens_mask, and return_length arguments of encode_plus
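
The supported options (padding, truncation, max_length, return_tensors, ...) behave as they do in transformers. A minimal sketch of batch encoding with the tokenizer loaded above, assuming its config defines a pad token (the sample sentences are placeholders):

batch = tokenizer(
    ["첫 번째 문장입니다.", "두 번째 문장은 조금 더 깁니다."],
    padding=True,
    truncation=True,
    max_length=32,
    return_tensors="np",
)
print(batch["input_ids"].shape)  # (2, padded length)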
Expand source code
'''
.. versionadded:: 0.15.1
The `transformers_addon` module provides the `KiwiTokenizer` class, which wraps Kiwi's SwTokenizer
so that it is compatible with the
[huggingface transformers tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer).
Using this feature requires `transformers>=4.12`.
```python
from transformers import AutoTokenizer
import kiwipiepy.transformers_addon
# Importing kiwipiepy.transformers_addon automatically
# registers KiwiTokenizer with AutoTokenizer.
# KiwiTokenizer is compatible with most of the huggingface transformers
# tokenizer API, so existing transformers-based code can be reused as-is.
# Assume an SwTokenizer has been saved to some_path/tokenizer.json
tokenizer = AutoTokenizer.from_pretrained('some_path')
tokenizer.encode("한국어를 고려한 토크나이저!")
```
Most of `KiwiTokenizer`'s functionality is compatible with transformers, but the following features are currently unsupported:
* `add_tokens`, `add_special_tokens`, and other ways of adding new tokens
* the `stride`, `is_split_into_words`, `return_overflowing_tokens`, `return_special_tokens_mask`, and `return_length` arguments of `encode_plus`
'''
import os
import itertools
from typing import Union, List, Optional, Dict, Tuple
import numpy as np
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import (
    PreTrainedTokenizerBase, 
    TextInput, 
    TextInputPair, 
    PreTokenizedInput, 
    PreTokenizedInputPair, 
    EncodedInput, 
    EncodedInputPair,
    PaddingStrategy,
    TruncationStrategy,
    TensorType,
    BatchEncoding, 
)
from kiwipiepy.sw_tokenizer import SwTokenizer, SwTokenizerConfig
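# Groups a flat iterator into consecutive (first, second) pairs; used to
# re-pair the encodings of interleaved text pairs in _make_encoded.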
def _group_by_two(iterator):
    try:
        while True:
            a = next(iterator)
            b = next(iterator)
            yield a, b
    except StopIteration:
        pass
class KiwiTokenizer(PreTrainedTokenizerBase):
    vocab_files_names = {"tokenizer_file": "tokenizer.json"}
    def __init__(self, tokenizer_file=None, **kwargs):
        if tokenizer_file is None:
            raise ValueError(f"Cannot instantiate tokenizer from {tokenizer_file!r}")
        
        self._tokenizer = SwTokenizer(tokenizer_file)
        
        super().__init__(**kwargs)
        self._post_processor = self._tokenizer.config.additional.get('post_processor') if isinstance(self._tokenizer.config.additional, dict) else None
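        # Only the 'bert' post-processor is recognized: it wraps sequences as
        # [CLS] ... [SEP] (and [CLS] ... [SEP] ... [SEP] for pairs).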
        if self._post_processor not in (None, 'bert'):
            raise ValueError(f"Unknown post_processor `{self._post_processor!r}`")
        self._bos_token = self._tokenizer.bos_token
        self._eos_token = self._tokenizer.eos_token
        self._unk_token = self._tokenizer.unk_token
        self._sep_token = self._tokenizer.sep_token
        self._pad_token = self._tokenizer.pad_token
        self._cls_token = self._tokenizer.cls_token
        self._mask_token = self._tokenizer.mask_token
    
    @property
    def unk_token(self) -> str:
        return self._tokenizer.unk_token
    
    @unk_token.setter
    def unk_token(self, s):
        if s != self._tokenizer.unk_token:
            raise AttributeError("can't set attribute 'unk_token'")
    @property
    def cls_token(self) -> str:
        return self._tokenizer.cls_token
    @cls_token.setter
    def cls_token(self, s):
        if s != self._tokenizer.cls_token:
            raise AttributeError("can't set attribute 'cls_token'")
    @property
    def sep_token(self) -> str:
        return self._tokenizer.sep_token
    
    @sep_token.setter
    def sep_token(self, s):
        if s != self._tokenizer.sep_token:
            raise AttributeError("can't set attribute 'sep_token'")
    @property
    def pad_token(self) -> str:
        return self._tokenizer.pad_token
    @pad_token.setter
    def pad_token(self, s):
        if s != self._tokenizer.pad_token:
            raise AttributeError("can't set attribute 'pad_token'")
    @property
    def mask_token(self) -> str:
        return self._tokenizer.mask_token
    
    @mask_token.setter
    def mask_token(self, s):
        if s != self._tokenizer.mask_token:
            raise AttributeError("can't set attribute 'mask_token'")
    @property
    def bos_token(self) -> str:
        return self._tokenizer.bos_token
    
    @bos_token.setter
    def bos_token(self, s):
        if s != self._tokenizer.bos_token:
            raise AttributeError("can't set attribute 'bos_token'")
    @property
    def eos_token(self) -> str:
        return self._tokenizer.eos_token
    @eos_token.setter
    def eos_token(self, s):
        if s != self._tokenizer.eos_token:
            raise AttributeError("can't set attribute 'eos_token'")
    @property
    def unk_token_id(self) -> Optional[int]:
        return self._tokenizer.unk_token_id
    @property
    def cls_token_id(self) -> Optional[int]:
        return self._tokenizer.cls_token_id
    @property
    def sep_token_id(self) -> Optional[int]:
        return self._tokenizer.sep_token_id
    
    @property
    def pad_token_id(self) -> Optional[int]:
        return self._tokenizer.pad_token_id
    @property
    def mask_token_id(self) -> Optional[int]:
        return self._tokenizer.mask_token_id
    
    @property
    def bos_token_id(self) -> Optional[int]:
        return self._tokenizer.bos_token_id
    
    @property
    def eos_token_id(self) -> Optional[int]:
        return self._tokenizer.eos_token_id
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
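        # stride, is_split_into_words, return_overflowing_tokens,
        # return_special_tokens_mask and return_length are accepted for API
        # compatibility but are not supported (see the module docstring).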
        
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names
        if not isinstance(batch_text_or_text_pairs, (list, tuple)):
            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
        input_ids, attention_mask, token_type_ids, offset_mapping = self._make_encoded(
            batch_text_or_text_pairs, add_special_tokens, 
            return_token_type_ids, return_attention_mask, return_offsets_mapping,
            return_as_list=(return_tensors is None),
            padding_strategy=padding_strategy, 
            truncation_strategy=truncation_strategy, 
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
        )
        data = dict(input_ids=input_ids)
        if return_attention_mask: data['attention_mask'] = attention_mask
        if return_token_type_ids: data['token_type_ids'] = token_type_ids
        if return_offsets_mapping: data['offset_mapping'] = offset_mapping
        for i in input_ids:
            self._eventual_warn_about_too_long_sequence(i, max_length, verbose)
        return BatchEncoding(data, tensor_type=return_tensors)
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names
        text = text if text_pair is None else (text, text_pair)
        input_ids, attention_mask, token_type_ids, offset_mapping = self._make_encoded(
            [text], add_special_tokens, 
            return_token_type_ids, return_attention_mask, return_offsets_mapping,
            return_as_list=(return_tensors is None),
            padding_strategy=padding_strategy, 
            truncation_strategy=truncation_strategy, 
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
        )
        if return_tensors is None and not return_overflowing_tokens:
            input_ids = input_ids[0]
            if return_attention_mask: attention_mask = attention_mask[0]
            if return_token_type_ids: token_type_ids = token_type_ids[0]
            if return_offsets_mapping: offset_mapping = offset_mapping[0]
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
        else:
            self._eventual_warn_about_too_long_sequence(input_ids[0], max_length, verbose)
        data = dict(input_ids=input_ids)
        if return_attention_mask: data['attention_mask'] = attention_mask
        if return_token_type_ids: data['token_type_ids'] = token_type_ids
        if return_offsets_mapping: data['offset_mapping'] = offset_mapping
            
        return BatchEncoding(data, tensor_type=return_tensors)
    def _make_encoded(
        self, 
        batch_text_or_text_pairs, 
        add_special_tokens, 
        return_token_type_ids, 
        return_attention_mask,
        return_offsets_mapping,
        return_as_list = False,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
    ):
        input_ids = []
        attention_mask = []
        token_type_ids = []
        offset_mapping = []
        if isinstance(batch_text_or_text_pairs[0], str): # single
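            # Encode the whole batch in a single SwTokenizer.encode call; each
            # yielded item is an id array (paired with an offset array when
            # return_offsets is set).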
            special_token_size = self.num_special_tokens_to_add(False) if add_special_tokens else 0
            for i in self._tokenizer.encode(batch_text_or_text_pairs, return_offsets=return_offsets_mapping):
                if return_offsets_mapping:
                    i, i_offset = i
                    i_offset = i_offset.astype(np.int64)
                i = i.astype(np.int64)
                if (truncation_strategy in (TruncationStrategy.LONGEST_FIRST, TruncationStrategy.ONLY_FIRST) 
                    and len(i) > max_length - special_token_size):
                    i = i[:max_length - special_token_size]
                    if return_offsets_mapping:
                        i_offset = i_offset[:max_length - special_token_size]
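                # BERT-style post-processing: wrap the ids as [CLS] ... [SEP];
                # the added special tokens get (0, 0) offsets.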
                if add_special_tokens and self._post_processor == 'bert':
                    i = np.pad(i.astype(np.int64), (1, 1))
                    i[0] = self._tokenizer.cls_token_id
                    i[-1] = self._tokenizer.sep_token_id
                    if return_offsets_mapping:
                        i_offset = np.pad(i_offset, ((1, 1), (0, 0)))
                input_ids.append(i)
                if return_attention_mask: attention_mask.append(np.ones_like(i))
                if return_token_type_ids: token_type_ids.append(np.zeros_like(i))
                if return_offsets_mapping: offset_mapping.append(i_offset)
        
        else: # pair
            special_token_size = self.num_special_tokens_to_add(True) if add_special_tokens else 0
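            # Flatten the (first, second) pairs, encode everything in a single
            # SwTokenizer.encode call, then restore the pairing with _group_by_two.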
            for i, j in _group_by_two(self._tokenizer.encode(itertools.chain.from_iterable(batch_text_or_text_pairs), return_offsets=return_offsets_mapping)):
                if return_offsets_mapping:
                    i, i_offset = i
                    j, j_offset = j
                    i_offset = i_offset.astype(np.int64)
                    j_offset = j_offset.astype(np.int64)
                if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
                    and len(i) + len(j) > max_length - special_token_size):
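                    # LONGEST_FIRST: trim the longer sequence toward the shorter
                    # one first, then trim both ends evenly if still too long.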
                    if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
                        t = len(i) + len(j) - (max_length - special_token_size)
                        d = abs(len(i) - len(j))
                        trunc_size = min(d, t)
                        # guard against trunc_size == 0: a[:-0] would empty the array
                        if trunc_size > 0:
                            if len(i) > len(j):
                                i = i[:-trunc_size]
                                if return_offsets_mapping:
                                    i_offset = i_offset[:-trunc_size]
                            else:
                                j = j[:-trunc_size]
                                if return_offsets_mapping:
                                    j_offset = j_offset[:-trunc_size]
                        
                        if t > d:
                            i = i[:-((t - d + 1) // 2)]
                            j = j[:-((t - d) // 2)]
                            if return_offsets_mapping:
                                i_offset = i_offset[:-((t - d + 1) // 2)]
                                j_offset = j_offset[:-((t - d) // 2)]
                    elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
                        i = i[:max(max_length - special_token_size - len(j), 0)]
                        if return_offsets_mapping:
                            i_offset = i_offset[:max(max_length - special_token_size - len(j), 0)]
                    elif truncation_strategy == TruncationStrategy.ONLY_SECOND:
                        j = j[:max(max_length - special_token_size - len(i), 0)]
                        if return_offsets_mapping:
                            j_offset = j_offset[:max(max_length - special_token_size - len(i), 0)]
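                # BERT-style pair layout: [CLS] first [SEP] second [SEP];
                # token_type_ids are 0 for the first segment (including [CLS]
                # and its [SEP]) and 1 for the second segment.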
                if add_special_tokens and self._post_processor == 'bert':
                    c = np.concatenate([[self._tokenizer.cls_token_id], i, [self._tokenizer.sep_token_id], j, [self._tokenizer.sep_token_id]])
                    t = (np.arange(len(c)) >= len(i) + 2).astype(np.int64)
                    if return_offsets_mapping:
                        c_offset = np.concatenate([np.pad(i_offset, ((1, 1), (0, 0))), np.pad(j_offset, ((0, 1), (0, 0)))], axis=0)
                else:
                    c = np.concatenate([i, j])
                    t = (np.arange(len(c)) >= len(i)).astype(np.int64)
                    if return_offsets_mapping:
                        c_offset = np.concatenate([i_offset, j_offset], axis=0)
                input_ids.append(c)
                if return_attention_mask: attention_mask.append(np.ones_like(c))
                if return_token_type_ids: token_type_ids.append(t)
                if return_offsets_mapping: offset_mapping.append(c_offset)
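        # Determine the padded length: the longest sequence in the batch
        # (LONGEST), or the longer of that and max_length (MAX_LENGTH),
        # rounded up to a multiple of pad_to_multiple_of if requested.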
        if padding_strategy == PaddingStrategy.LONGEST:
            final_length = max(map(len, input_ids))
            if pad_to_multiple_of: final_length = ((final_length + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
        elif padding_strategy == PaddingStrategy.MAX_LENGTH:
            final_length = max(max(map(len, input_ids)), max_length or 0)
            if pad_to_multiple_of: final_length = ((final_length + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
        else:
            final_length = None
        
        if final_length:
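            # Right-pad input_ids with pad_token_id and the other outputs with
            # zeros; the IndexError guards skip outputs that were not collected.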
            for i in range(len(input_ids)):
                input_ids[i] = np.pad(input_ids[i], (0, final_length - len(input_ids[i])), constant_values=self._tokenizer.pad_token_id)
                
                try: attention_mask[i] = np.pad(attention_mask[i], (0, final_length - len(attention_mask[i])))
                except IndexError: pass
                
                try: token_type_ids[i] = np.pad(token_type_ids[i], (0, final_length - len(token_type_ids[i])))
                except IndexError: pass
                
                try: offset_mapping[i] = np.pad(offset_mapping[i], ((0, final_length - len(offset_mapping[i])), (0, 0)))
                except IndexError: pass
        if return_as_list:
            input_ids = list(map(np.ndarray.tolist, input_ids))
            attention_mask = list(map(np.ndarray.tolist, attention_mask))
            token_type_ids = list(map(np.ndarray.tolist, token_type_ids))
            offset_mapping = list(map(lambda x:list(map(tuple, x)), (map(np.ndarray.tolist, offset_mapping))))
        return input_ids, attention_mask, token_type_ids, offset_mapping
    def _decode(
        self, 
        token_ids, 
        skip_special_tokens = False, 
        clean_up_tokenization_spaces = True,
    ):
        if isinstance(token_ids, int): token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i not in self.all_special_ids]
        return self._tokenizer.decode(token_ids)
    def get_added_vocab(self) -> Dict[str, int]:
        return {}
    
    def get_vocab(self):
        return self._tokenizer.vocab
    
    @property
    def vocab(self):
        return self._tokenizer.vocab
    
    @property
    def vocab_size(self) -> int:
        return len(self._tokenizer.vocab)
    def __len__(self):
        return len(self._tokenizer)
    @property
    def is_fast(self) -> bool:
        return False
    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        if tokens is None:
            return None
        if isinstance(tokens, str):
            return self._tokenizer.vocab.get(tokens, self._tokenizer.unk_token_id)
        ids = []
        for token in tokens:
            ids.append(self._tokenizer.vocab.get(token, self._tokenizer.unk_token_id))
        return ids
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        if self._post_processor == 'bert':
            return 3 if pair else 2
        return 0
    def _add_tokens(self, new_tokens, special_tokens = False) -> int:
        if all(t in self.vocab for t in new_tokens):
            return 0
        raise NotImplementedError("`KiwiTokenizer.add_tokens` is not supported yet.")
    def tokenize(
        self, 
        text: str, 
        pair: Optional[str] = None, 
        add_special_tokens: bool = False,
    ) -> List[str]:
        ids = self.encode(text, pair, add_special_tokens=add_special_tokens)
        return list(map(self._tokenizer.id2vocab.__getitem__, ids))
    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        if isinstance(ids, int):
            if skip_special_tokens and ids in self.all_special_ids: return ''
            return self._tokenizer.id2vocab[ids]
        if skip_special_tokens:
            ids = [i for i in ids if i not in self.all_special_ids]
        return list(map(self._tokenizer.id2vocab.__getitem__, ids))
    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        save_directory = str(save_directory)
        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You "
                "might consider leaving the legacy_format at `None` or setting it to `False`."
            )
        tokenizer_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + "tokenizer.json"
        )
        self._tokenizer.save(tokenizer_file)
        file_names = file_names + (tokenizer_file,)
        return file_names
    @property
    def added_tokens_decoder(self):
        return {}
AutoTokenizer.register('KiwiTokenizer', None, KiwiTokenizer)

Classes
- class KiwiTokenizer (tokenizer_file=None, **kwargs)
- 
Base class for PreTrainedTokenizer and PreTrainedTokenizerFast. Handles shared (mostly boilerplate) methods for those two classes. See the huggingface transformers documentation of PreTrainedTokenizerBase for the descriptions of the inherited class attributes (vocab_files_names, pretrained_vocab_files_map, max_model_input_sizes, pretrained_init_configuration, model_input_names, padding_side, truncation_side) and constructor arguments (model_max_length, padding_side, truncation_side, chat_template, model_input_names, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, additional_special_tokens, clean_up_tokenization_spaces, split_special_tokens).

Ancestors

- transformers.tokenization_utils_base.PreTrainedTokenizerBase
- transformers.tokenization_utils_base.SpecialTokensMixin
- transformers.utils.hub.PushToHubMixin

Class variables

- var max_model_input_sizes : Dict[str, Optional[int]]
- var model_input_names : List[str]
- var padding_side : str
- var pretrained_init_configuration : Dict[str, Dict[str, Any]]
- var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
- var truncation_side : str
- var vocab_files_names : Dict[str, str]

Instance variables

- var added_tokens_decoder
- var bos_token : str
- 
str: Beginning of sentence token. Log an error if used while not having been set.
- var bos_token_id : Optional[int]
- 
Optional[int]: Id of the beginning of sentence token in the vocabulary. Returns None if the token has not been set.
- var cls_token : str
- 
str: Classification token, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set.
- var cls_token_id : Optional[int]
- 
Optional[int]: Id of the classification token in the vocabulary, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Returns None if the token has not been set.
- var eos_token : str
- 
str: End of sentence token. Log an error if used while not having been set.
- var eos_token_id : Optional[int]
- 
Optional[int]: Id of the end of sentence token in the vocabulary. Returns None if the token has not been set.
- var is_fast : bool
- var mask_token : str
- 
str: Mask token, to use when training a model with masked-language modeling. Log an error if used while not having been set.
- var mask_token_id : Optional[int]
- 
Optional[int]: Id of the mask token in the vocabulary, used when training a model with masked-language modeling. Returns None if the token has not been set.
- var pad_token : str
- 
str: Padding token. Log an error if used while not having been set.
- var pad_token_id : Optional[int]
- 
Optional[int]: Id of the padding token in the vocabulary. Returns None if the token has not been set.
- var sep_token : str
- 
str: Separation token, to separate context and query in an input sequence. Log an error if used while not having been set.
- var sep_token_id : Optional[int]
- 
Optional[int]: Id of the separation token in the vocabulary, to separate context and query in an input sequence. Returns None if the token has not been set.
- var unk_token : str
- 
str: Unknown token. Log an error if used while not having been set.
- var unk_token_id : Optional[int]
- 
Optional[int]: Id of the unknown token in the vocabulary. Returns None if the token has not been set.
- var vocab
- var vocab_size : int

Methods

- def convert_ids_to_tokens(self, ids: Union[int, List[int]], skip_special_tokens: bool = False) ‑> Union[str, List[str]]
- def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) ‑> Union[int, List[int]]
- def get_added_vocab(self) ‑> Dict[str, int]
- def get_vocab(self)
- 
Returns the vocabulary as a dictionary of token to index. tokenizer.get_vocab()[token] is equivalent to tokenizer.convert_tokens_to_ids(token) when token is in the vocab.
Returns: Dict[str, int]: The vocabulary.
- def num_special_tokens_to_add(self, pair: bool = False) ‑> int
- def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False) ‑> List[str]
- 
Converts a string into a sequence of tokens, replacing unknown tokens with the unk_token.
Args:
text (str): The sequence to be encoded.
pair (str, optional): A second sequence to be encoded with the first.
add_special_tokens (bool, optional, defaults to False): Whether or not to add the special tokens associated with the corresponding model.
kwargs (additional keyword arguments, optional): Will be passed to the underlying model specific encode method. See details in PreTrainedTokenizerBase.__call__.
Returns: List[str]: The list of tokens.
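
As an illustration of how these methods relate, a brief sketch (reusing the tokenizer loaded from the hypothetical some_path in the module example):

tokens = tokenizer.tokenize("한국어를 고려한 토크나이저!")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokenizer.convert_ids_to_tokens(ids))        # round-trips back to the token strings
print(tokenizer.get_vocab()[tokens[0]] == ids[0])  # get_vocab()[token] matches convert_tokens_to_ids(token)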