Module `kiwipiepy.transformers_addon`

Added in version: 0.15.1

transformers_addon 모듈은 Kiwi의 SwTokenizer를 huggingface transformers의 tokenizer와 호환이 가능하도록 래핑한 KiwiTokenizer 클래스를 제공합니다.

이 기능을 사용하기 위해서는 transformers>=4.12 이 필요합니다.

from transformers import AutoTokenizer

import kiwipiepy.transformers_addon
# kiwipiepy.transformers_addon를 import하면
# 자동으로 KiwiTokenizer가 AutoTokenizer에 register됩니다.
# KiwiTokenizer는 huggingface transformers의 토크나이저와 대부분의 기능이 호환되므로
# 기존 transformers 기반의 코드를 그대로 재사용할 수 있습니다.

# SwTokenizer가 some_path/tokenizer.json에 저장되어있다고 가정
tokenizer = AutoTokenizer.from_pretrained('some_path')
tokenizer.encode("한국어를 고려한 토크나이저!")

KiwiTokenizer의 주요 기능들은 transformers와 대부분 호환됩니다만, 다음 기능들은 현재 호환되지 않습니다.

add_tokens, add_special_tokens 등 새 토큰을 추가하는 기능
encode_plus의 stride, is_split_into_words, return_overflowing_tokens, return_special_tokens_mask, return_length 인자

Expand source code

'''
.. versionadded:: 0.15.1

`transformers_addon` 모듈은 Kiwi의 SwTokenizer를 
[huggingface transformers의 tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer)와 
호환이 가능하도록 래핑한 `KiwiTokenizer` 클래스를 제공합니다.

이 기능을 사용하기 위해서는 `transformers>=4.12` 이 필요합니다.

```python
from transformers import AutoTokenizer

import kiwipiepy.transformers_addon
# kiwipiepy.transformers_addon를 import하면
# 자동으로 KiwiTokenizer가 AutoTokenizer에 register됩니다.
# KiwiTokenizer는 huggingface transformers의 토크나이저와 대부분의 기능이 호환되므로
# 기존 transformers 기반의 코드를 그대로 재사용할 수 있습니다.

# SwTokenizer가 some_path/tokenizer.json에 저장되어있다고 가정
tokenizer = AutoTokenizer.from_pretrained('some_path')
tokenizer.encode("한국어를 고려한 토크나이저!")
```

`KiwiTokenizer`의 주요 기능들은 transformers와 대부분 호환됩니다만, 다음 기능들은 현재 호환되지 않습니다.

* `add_tokens`, `add_special_tokens` 등 새 토큰을 추가하는 기능
* `encode_plus`의  `stride`, `is_split_into_words`, `return_overflowing_tokens`, `return_special_tokens_mask`, `return_length` 인자
'''


import os
import itertools
from typing import Union, List, Optional, Dict, Tuple

import numpy as np

from transformers import AutoTokenizer

from transformers.tokenization_utils_base import (
    PreTrainedTokenizerBase, 
    TextInput, 
    TextInputPair, 
    PreTokenizedInput, 
    PreTokenizedInputPair, 
    EncodedInput, 
    EncodedInputPair,
    PaddingStrategy,
    TruncationStrategy,
    TensorType,
    BatchEncoding, 
)

from kiwipiepy.sw_tokenizer import SwTokenizer, SwTokenizerConfig

def _group_by_two(iterator):
    try:
        while True:
            a = next(iterator)
            b = next(iterator)
            yield a, b
    except StopIteration:
        pass

class KiwiTokenizer(PreTrainedTokenizerBase):

    vocab_files_names = {"tokenizer_file": "tokenizer.json"}

    def __init__(self, tokenizer_file=None, **kwargs):
        super().__init__(**kwargs)
        if tokenizer_file is None:
            raise ValueError(f"Cannot instantiate tokenizer from {tokenizer_file!r}")
        
        self._tokenizer = SwTokenizer(tokenizer_file)
        self._post_processor = self._tokenizer.config.additional.get('post_processor') if isinstance(self._tokenizer.config.additional, dict) else None
        if self._post_processor not in (None, 'bert'):
            raise ValueError(f"Unknown post_processor `{self._post_processor!r}`")

        self._bos_token = self._tokenizer.bos_token
        self._eos_token = self._tokenizer.eos_token
        self._unk_token = self._tokenizer.unk_token
        self._sep_token = self._tokenizer.sep_token
        self._pad_token = self._tokenizer.pad_token
        self._cls_token = self._tokenizer.cls_token
        self._mask_token = self._tokenizer.mask_token
    
    @property
    def unk_token(self) -> str:
        return self._tokenizer.unk_token
    
    @unk_token.setter
    def unk_token(self, s):
        if s != self._tokenizer.unk_token:
            raise AttributeError("can't set attribute 'unk_token'")

    @property
    def cls_token(self) -> str:
        return self._tokenizer.cls_token

    @cls_token.setter
    def cls_token(self, s):
        if s != self._tokenizer.cls_token:
            raise AttributeError("can't set attribute 'cls_token'")

    @property
    def sep_token(self) -> str:
        return self._tokenizer.sep_token
    
    @sep_token.setter
    def sep_token(self, s):
        if s != self._tokenizer.sep_token:
            raise AttributeError("can't set attribute 'sep_token'")

    @property
    def pad_token(self) -> str:
        return self._tokenizer.pad_token

    @pad_token.setter
    def pad_token(self, s):
        if s != self._tokenizer.pad_token:
            raise AttributeError("can't set attribute 'pad_token'")

    @property
    def mask_token(self) -> str:
        return self._tokenizer.mask_token
    
    @mask_token.setter
    def mask_token(self, s):
        if s != self._tokenizer.mask_token:
            raise AttributeError("can't set attribute 'mask_token'")

    @property
    def bos_token(self) -> str:
        return self._tokenizer.bos_token
    
    @bos_token.setter
    def bos_token(self, s):
        if s != self._tokenizer.bos_token:
            raise AttributeError("can't set attribute 'bos_token'")

    @property
    def eos_token(self) -> str:
        return self._tokenizer.eos_token

    @eos_token.setter
    def eos_token(self, s):
        if s != self._tokenizer.eos_token:
            raise AttributeError("can't set attribute 'eos_token'")

    @property
    def unk_token_id(self) -> str:
        return self._tokenizer.unk_token_id

    @property
    def cls_token_id(self) -> str:
        return self._tokenizer.cls_token_id

    @property
    def sep_token_id(self) -> str:
        return self._tokenizer.sep_token_id
    
    @property
    def pad_token_id(self) -> str:
        return self._tokenizer.pad_token_id

    @property
    def mask_token_id(self) -> str:
        return self._tokenizer.mask_token_id
    
    @property
    def bos_token_id(self) -> str:
        return self._tokenizer.bos_token_id
    
    @property
    def eos_token_id(self) -> str:
        return self._tokenizer.eos_token_id

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if not isinstance(batch_text_or_text_pairs, (list, tuple)):
            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")

        input_ids, attention_mask, token_type_ids, offset_mapping = self._make_encoded(
            batch_text_or_text_pairs, add_special_tokens, 
            return_token_type_ids, return_attention_mask, return_offsets_mapping,
            return_as_list=(return_tensors is None),
            padding_strategy=padding_strategy, 
            truncation_strategy=truncation_strategy, 
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
        )
        data = dict(input_ids=input_ids)
        if return_attention_mask: data['attention_mask'] = attention_mask
        if return_token_type_ids: data['token_type_ids'] = token_type_ids
        if return_offsets_mapping: data['offset_mapping'] = offset_mapping

        for i in input_ids:
            self._eventual_warn_about_too_long_sequence(i, max_length, verbose)
        return BatchEncoding(data, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        text = text if text_pair is None else (text, text_pair)
        input_ids, attention_mask, token_type_ids, offset_mapping = self._make_encoded(
            [text], add_special_tokens, 
            return_token_type_ids, return_attention_mask, return_offsets_mapping,
            return_as_list=(return_tensors is None),
            padding_strategy=padding_strategy, 
            truncation_strategy=truncation_strategy, 
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
        )

        if return_tensors is None and not return_overflowing_tokens:
            input_ids = input_ids[0]
            if return_attention_mask: attention_mask = attention_mask[0]
            if return_token_type_ids: token_type_ids = token_type_ids[0]
            if return_offsets_mapping: offset_mapping = offset_mapping[0]
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
        else:
            self._eventual_warn_about_too_long_sequence(input_ids[0], max_length, verbose)

        data = dict(input_ids=input_ids)
        if return_attention_mask: data['attention_mask'] = attention_mask
        if return_token_type_ids: data['token_type_ids'] = token_type_ids
        if return_offsets_mapping: data['offset_mapping'] = offset_mapping
            
        return BatchEncoding(data, tensor_type=return_tensors)

    def _make_encoded(
        self, 
        batch_text_or_text_pairs, 
        add_special_tokens, 
        return_token_type_ids, 
        return_attention_mask,
        return_offsets_mapping,
        return_as_list = False,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
    ):
        input_ids = []
        attention_mask = []
        token_type_ids = []
        offset_mapping = []
        if isinstance(batch_text_or_text_pairs[0], str): # single
            special_token_size = self.num_special_tokens_to_add(False) if add_special_tokens else 0

            for i in self._tokenizer.encode(batch_text_or_text_pairs, return_offsets=return_offsets_mapping):
                if return_offsets_mapping:
                    i, i_offset = i
                    i_offset = i_offset.astype(np.int64)
                i = i.astype(np.int64)
                if (truncation_strategy in (TruncationStrategy.LONGEST_FIRST, TruncationStrategy.ONLY_FIRST) 
                    and len(i) > max_length - special_token_size):
                    i = i[:max_length - special_token_size]
                    if return_offsets_mapping:
                        i_offset = i_offset[:max_length - special_token_size]

                if add_special_tokens and self._post_processor == 'bert':
                    i = np.pad(i.astype(np.int64), (1, 1))
                    i[0] = self._tokenizer.cls_token_id
                    i[-1] = self._tokenizer.sep_token_id
                    if return_offsets_mapping:
                        i_offset = np.pad(i_offset, ((1, 1), (0, 0)))
                input_ids.append(i)
                if return_attention_mask: attention_mask.append(np.ones_like(i))
                if return_token_type_ids: token_type_ids.append(np.zeros_like(i))
                if return_offsets_mapping: offset_mapping.append(i_offset)
        
        else: # pair
            special_token_size = self.num_special_tokens_to_add(True) if add_special_tokens else 0

            for i, j in _group_by_two(self._tokenizer.encode(itertools.chain.from_iterable(batch_text_or_text_pairs), return_offsets=return_offsets_mapping)):
                if return_offsets_mapping:
                    i, i_offset = i
                    j, j_offset = j
                    i_offset = i_offset.astype(np.int64)
                    j_offset = j_offset.astype(np.int64)
                if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
                    and len(i) + len(j) > max_length - special_token_size):
                    if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
                        t = len(i) + len(j) - (max_length - special_token_size)
                        d = abs(len(i) - len(j))
                        trunc_size = min(d, t)
                        if len(i) > len(j):
                            i = i[:-trunc_size]
                            if return_offsets_mapping:
                                i_offset = i_offset[:-trunc_size]
                        else:
                            j = j[:-trunc_size]
                            if return_offsets_mapping:
                                j_offset = j_offset[:-trunc_size]
                        
                        if t > d:
                            i = i[:-((t - d + 1) // 2)]
                            j = j[:-((t - d) // 2)]
                            if return_offsets_mapping:
                                i_offset = i_offset[:-((t - d + 1) // 2)]
                                j_offset = j_offset[:-((t - d) // 2)]
                    elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
                        i = i[:max(max_length - special_token_size - len(j), 0)]
                        if return_offsets_mapping:
                            i_offset = i_offset[:max(max_length - special_token_size - len(j), 0)]
                    elif truncation_strategy == TruncationStrategy.ONLY_SECOND:
                        j = j[:max(max_length - special_token_size - len(i), 0)]
                        if return_offsets_mapping:
                            j_offset = j_offset[:max(max_length - special_token_size - len(i), 0)]

                if add_special_tokens and self._post_processor == 'bert':
                    c = np.concatenate([[self._tokenizer.cls_token_id], i, [self._tokenizer.sep_token_id], j, [self._tokenizer.sep_token_id]])
                    t = (np.arange(len(c)) >= len(i) + 2).astype(np.int64)
                    if return_offsets_mapping:
                        c_offset = np.concatenate([np.pad(i_offset, ((1, 1), (0, 0))), np.pad(j_offset, ((0, 1), (0, 0)))], axis=0)
                else:
                    c = np.concatenate([i, j])
                    t = (np.arange(len(c)) >= len(i)).astype(np.int64)
                    if return_offsets_mapping:
                        c_offset = np.concatenate([i_offset, j_offset], axis=0)
                input_ids.append(c)
                if return_attention_mask: attention_mask.append(np.ones_like(c))
                if return_token_type_ids: token_type_ids.append(t)
                if return_offsets_mapping: offset_mapping.append(c_offset)

        if padding_strategy == PaddingStrategy.LONGEST:
            final_length = max(map(len, input_ids))
            if pad_to_multiple_of: final_length = ((final_length + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
        elif padding_strategy == PaddingStrategy.MAX_LENGTH:
            final_length = max(max(map(len, input_ids)), max_length or 0)
            if pad_to_multiple_of: final_length = ((final_length + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
        else:
            final_length = None
        
        if final_length:
            for i in range(len(input_ids)):
                input_ids[i] = np.pad(input_ids[i], (0, final_length - len(input_ids[i])), constant_values=self._tokenizer.pad_token_id)
                
                try: attention_mask[i] = np.pad(attention_mask[i], (0, final_length - len(attention_mask[i])))
                except IndexError: pass
                
                try: token_type_ids[i] = np.pad(token_type_ids[i], (0, final_length - len(token_type_ids[i])))
                except IndexError: pass
                
                try: offset_mapping[i] = np.pad(offset_mapping[i], ((0, final_length - len(offset_mapping[i])), (0, 0)))
                except IndexError: pass

        if return_as_list:
            input_ids = list(map(np.ndarray.tolist, input_ids))
            attention_mask = list(map(np.ndarray.tolist, attention_mask))
            token_type_ids = list(map(np.ndarray.tolist, token_type_ids))
            offset_mapping = list(map(lambda x:list(map(tuple, x)), (map(np.ndarray.tolist, offset_mapping))))

        return input_ids, attention_mask, token_type_ids, offset_mapping

    def _decode(
        self, 
        token_ids, 
        skip_special_tokens = False, 
        clean_up_tokenization_spaces = True,
    ):
        if isinstance(token_ids, int): token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i not in self.all_special_ids]
        return self._tokenizer.decode(token_ids)

    def get_added_vocab(self) -> Dict[str, int]:
        return {}
    
    def get_vocab(self):
        return self._tokenizer.vocab
    
    @property
    def vocab(self):
        return self._tokenizer.vocab
    
    @property
    def vocab_size(self) -> int:
        return len(self._tokenizer.vocab)

    def __len__(self):
        return len(self._tokenizer)

    @property
    def is_fast(self) -> bool:
        return False

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._tokenizer.vocab.get(tokens, self._tokenizer.unk_token_id)

        ids = []
        for token in tokens:
            ids.append(self._tokenizer.vocab.get(token, self._tokenizer.unk_token_id))
        return ids

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        if self._post_processor == 'bert':
            return 3 if pair else 2
        return 0

    def _add_tokens(self, new_tokens, special_tokens = False) -> int:
        if all(t in self.vocab for t in new_tokens):
            return 0
        raise NotImplementedError("`KiwiTokenizer.add_tokens` is not support yet.")

    def tokenize(
        self, 
        text: str, 
        pair: Optional[str] = None, 
        add_special_tokens: bool = False,
    ) -> List[str]:
        ids = self.encode(text, pair, add_special_tokens=add_special_tokens)
        return list(map(self._tokenizer.id2vocab.__getitem__, ids))

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        if isinstance(ids, int):
            if skip_special_tokens and ids in self.all_special_ids: return ''
            return self._tokenizer.id2vocab[ids]

        if skip_special_tokens:
            ids = [i for i in ids if i not in self.all_special_ids]
        return list(map(self._tokenizer.id2vocab.__getitem__, ids))

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        save_directory = str(save_directory)

        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You "
                "might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        tokenizer_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + "tokenizer.json"
        )
        self._tokenizer.save(tokenizer_file)
        file_names = file_names + (tokenizer_file,)

        return file_names

AutoTokenizer.register('KiwiTokenizer', None, KiwiTokenizer)

Classes

class KiwiTokenizer (tokenizer_file=None, **kwargs)

Base class for [PreTrainedTokenizer] and [PreTrainedTokenizerFast].

Handles shared (mostly boiler plate) methods for those two classes.

Class attributes (overridden by derived classes)

- **vocab_files_names** (<code>Dict\[str, str]</code>) -- A dictionary with, as keys, the <code>\_\_init\_\_</code> keyword name of each
  vocabulary file required by the model, and as associated values, the filename for saving the associated file
  (string).
- **pretrained_vocab_files_map** (<code>Dict\[str, Dict\[str, str]]</code>) -- A dictionary of dictionaries, with the
  high-level keys being the <code>\_\_init\_\_</code> keyword name of each vocabulary file required by the model, the
  low-level being the `short-cut-names` of the pretrained models with, as associated values, the <code>url</code> to the
  associated pretrained vocabulary file.
- **max_model_input_sizes** (<code>Dict\[str, Optional\[int]]</code>) -- A dictionary with, as keys, the `short-cut-names`
  of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model,
  or <code>None</code> if the model has no maximum input size.
- **pretrained_init_configuration** (<code>Dict\[str, Dict\[str, Any]]</code>) -- A dictionary with, as keys, the
  `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to
  pass to the <code>\_\_init\_\_</code> method of the tokenizer class for this pretrained model when loading the tokenizer
  with the [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`] method.
- **model_input_names** (<code>List\[str]</code>) -- A list of inputs expected in the forward pass of the model.
- **padding_side** (<code>str</code>) -- The default value for the side on which the model should have padding applied.
  Should be `'right'` or `'left'`.
- **truncation_side** (<code>str</code>) -- The default value for the side on which the model should have truncation
  applied. Should be `'right'` or `'left'`.

Args

model_max_length (int, optional): The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is loaded with [~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained], this will be set to the value stored for the associated model in max_model_input_sizes (see above). If no value is provided, will default to VERY_LARGE_INTEGER (int(1e30)). padding_side (str, optional): The side on which the model should have padding applied. Should be selected between ['right', 'left']. Default value is picked from the class attribute of the same name. truncation_side (str, optional): The side on which the model should have truncation applied. Should be selected between ['right', 'left']. Default value is picked from the class attribute of the same name. model_input_names (List[string], optional): The list of inputs accepted by the forward pass of the model (like "token_type_ids" or "attention_mask"). Default value is picked from the class attribute of the same name. bos_token (str or tokenizers.AddedToken, optional): A special token representing the beginning of a sentence. Will be associated to self.bos_token and self.bos_token_id. eos_token (str or tokenizers.AddedToken, optional): A special token representing the end of a sentence. Will be associated to self.eos_token and self.eos_token_id. unk_token (str or tokenizers.AddedToken, optional): A special token representing an out-of-vocabulary token. Will be associated to self.unk_token and self.unk_token_id. sep_token (str or tokenizers.AddedToken, optional): A special token separating two different sentences in the same input (used by BERT for instance). Will be associated to self.sep_token and self.sep_token_id. pad_token (str or tokenizers.AddedToken, optional): A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by attention mechanisms or loss computation. Will be associated to self.pad_token and self.pad_token_id. cls_token (str or tokenizers.AddedToken, optional): A special token representing the class of the input (used by BERT for instance). Will be associated to self.cls_token and self.cls_token_id. mask_token (str or tokenizers.AddedToken, optional): A special token representing a masked token (used by masked-language modeling pretraining objectives, like BERT). Will be associated to self.mask_token and self.mask_token_id. additional_special_tokens (tuple or list of str or tokenizers.AddedToken, optional): A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the tokenization process. Will be associated to self.additional_special_tokens and self.additional_special_tokens_ids. clean_up_tokenization_spaces (bool, optional, defaults to True): Whether or not the model should cleanup the spaces that were added when splitting the input text during the tokenization process. split_special_tokens (bool, optional, defaults to False): Whether or not the special tokens should be split during the tokenization process. The default behavior is to not split special tokens. This means that if <s> is the bos_token, then tokenizer.tokenize("<s>") = ['<s>]. Otherwise, if split_special_tokens=True<code>, then </code>tokenizer.tokenize("<s>")<code> will be give </code>['<', 's', '>']<code>. This argument is only supported for </code>slow tokenizers for the moment.

Expand source code

class KiwiTokenizer(PreTrainedTokenizerBase):

    vocab_files_names = {"tokenizer_file": "tokenizer.json"}

    def __init__(self, tokenizer_file=None, **kwargs):
        super().__init__(**kwargs)
        if tokenizer_file is None:
            raise ValueError(f"Cannot instantiate tokenizer from {tokenizer_file!r}")
        
        self._tokenizer = SwTokenizer(tokenizer_file)
        self._post_processor = self._tokenizer.config.additional.get('post_processor') if isinstance(self._tokenizer.config.additional, dict) else None
        if self._post_processor not in (None, 'bert'):
            raise ValueError(f"Unknown post_processor `{self._post_processor!r}`")

        self._bos_token = self._tokenizer.bos_token
        self._eos_token = self._tokenizer.eos_token
        self._unk_token = self._tokenizer.unk_token
        self._sep_token = self._tokenizer.sep_token
        self._pad_token = self._tokenizer.pad_token
        self._cls_token = self._tokenizer.cls_token
        self._mask_token = self._tokenizer.mask_token
    
    @property
    def unk_token(self) -> str:
        return self._tokenizer.unk_token
    
    @unk_token.setter
    def unk_token(self, s):
        if s != self._tokenizer.unk_token:
            raise AttributeError("can't set attribute 'unk_token'")

    @property
    def cls_token(self) -> str:
        return self._tokenizer.cls_token

    @cls_token.setter
    def cls_token(self, s):
        if s != self._tokenizer.cls_token:
            raise AttributeError("can't set attribute 'cls_token'")

    @property
    def sep_token(self) -> str:
        return self._tokenizer.sep_token
    
    @sep_token.setter
    def sep_token(self, s):
        if s != self._tokenizer.sep_token:
            raise AttributeError("can't set attribute 'sep_token'")

    @property
    def pad_token(self) -> str:
        return self._tokenizer.pad_token

    @pad_token.setter
    def pad_token(self, s):
        if s != self._tokenizer.pad_token:
            raise AttributeError("can't set attribute 'pad_token'")

    @property
    def mask_token(self) -> str:
        return self._tokenizer.mask_token
    
    @mask_token.setter
    def mask_token(self, s):
        if s != self._tokenizer.mask_token:
            raise AttributeError("can't set attribute 'mask_token'")

    @property
    def bos_token(self) -> str:
        return self._tokenizer.bos_token
    
    @bos_token.setter
    def bos_token(self, s):
        if s != self._tokenizer.bos_token:
            raise AttributeError("can't set attribute 'bos_token'")

    @property
    def eos_token(self) -> str:
        return self._tokenizer.eos_token

    @eos_token.setter
    def eos_token(self, s):
        if s != self._tokenizer.eos_token:
            raise AttributeError("can't set attribute 'eos_token'")

    @property
    def unk_token_id(self) -> str:
        return self._tokenizer.unk_token_id

    @property
    def cls_token_id(self) -> str:
        return self._tokenizer.cls_token_id

    @property
    def sep_token_id(self) -> str:
        return self._tokenizer.sep_token_id
    
    @property
    def pad_token_id(self) -> str:
        return self._tokenizer.pad_token_id

    @property
    def mask_token_id(self) -> str:
        return self._tokenizer.mask_token_id
    
    @property
    def bos_token_id(self) -> str:
        return self._tokenizer.bos_token_id
    
    @property
    def eos_token_id(self) -> str:
        return self._tokenizer.eos_token_id

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if not isinstance(batch_text_or_text_pairs, (list, tuple)):
            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")

        input_ids, attention_mask, token_type_ids, offset_mapping = self._make_encoded(
            batch_text_or_text_pairs, add_special_tokens, 
            return_token_type_ids, return_attention_mask, return_offsets_mapping,
            return_as_list=(return_tensors is None),
            padding_strategy=padding_strategy, 
            truncation_strategy=truncation_strategy, 
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
        )
        data = dict(input_ids=input_ids)
        if return_attention_mask: data['attention_mask'] = attention_mask
        if return_token_type_ids: data['token_type_ids'] = token_type_ids
        if return_offsets_mapping: data['offset_mapping'] = offset_mapping

        for i in input_ids:
            self._eventual_warn_about_too_long_sequence(i, max_length, verbose)
        return BatchEncoding(data, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        text = text if text_pair is None else (text, text_pair)
        input_ids, attention_mask, token_type_ids, offset_mapping = self._make_encoded(
            [text], add_special_tokens, 
            return_token_type_ids, return_attention_mask, return_offsets_mapping,
            return_as_list=(return_tensors is None),
            padding_strategy=padding_strategy, 
            truncation_strategy=truncation_strategy, 
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
        )

        if return_tensors is None and not return_overflowing_tokens:
            input_ids = input_ids[0]
            if return_attention_mask: attention_mask = attention_mask[0]
            if return_token_type_ids: token_type_ids = token_type_ids[0]
            if return_offsets_mapping: offset_mapping = offset_mapping[0]
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
        else:
            self._eventual_warn_about_too_long_sequence(input_ids[0], max_length, verbose)

        data = dict(input_ids=input_ids)
        if return_attention_mask: data['attention_mask'] = attention_mask
        if return_token_type_ids: data['token_type_ids'] = token_type_ids
        if return_offsets_mapping: data['offset_mapping'] = offset_mapping
            
        return BatchEncoding(data, tensor_type=return_tensors)

    def _make_encoded(
        self, 
        batch_text_or_text_pairs, 
        add_special_tokens, 
        return_token_type_ids, 
        return_attention_mask,
        return_offsets_mapping,
        return_as_list = False,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
    ):
        input_ids = []
        attention_mask = []
        token_type_ids = []
        offset_mapping = []
        if isinstance(batch_text_or_text_pairs[0], str): # single
            special_token_size = self.num_special_tokens_to_add(False) if add_special_tokens else 0

            for i in self._tokenizer.encode(batch_text_or_text_pairs, return_offsets=return_offsets_mapping):
                if return_offsets_mapping:
                    i, i_offset = i
                    i_offset = i_offset.astype(np.int64)
                i = i.astype(np.int64)
                if (truncation_strategy in (TruncationStrategy.LONGEST_FIRST, TruncationStrategy.ONLY_FIRST) 
                    and len(i) > max_length - special_token_size):
                    i = i[:max_length - special_token_size]
                    if return_offsets_mapping:
                        i_offset = i_offset[:max_length - special_token_size]

                if add_special_tokens and self._post_processor == 'bert':
                    i = np.pad(i.astype(np.int64), (1, 1))
                    i[0] = self._tokenizer.cls_token_id
                    i[-1] = self._tokenizer.sep_token_id
                    if return_offsets_mapping:
                        i_offset = np.pad(i_offset, ((1, 1), (0, 0)))
                input_ids.append(i)
                if return_attention_mask: attention_mask.append(np.ones_like(i))
                if return_token_type_ids: token_type_ids.append(np.zeros_like(i))
                if return_offsets_mapping: offset_mapping.append(i_offset)
        
        else: # pair
            special_token_size = self.num_special_tokens_to_add(True) if add_special_tokens else 0

            for i, j in _group_by_two(self._tokenizer.encode(itertools.chain.from_iterable(batch_text_or_text_pairs), return_offsets=return_offsets_mapping)):
                if return_offsets_mapping:
                    i, i_offset = i
                    j, j_offset = j
                    i_offset = i_offset.astype(np.int64)
                    j_offset = j_offset.astype(np.int64)
                if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
                    and len(i) + len(j) > max_length - special_token_size):
                    if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
                        t = len(i) + len(j) - (max_length - special_token_size)
                        d = abs(len(i) - len(j))
                        trunc_size = min(d, t)
                        if len(i) > len(j):
                            i = i[:-trunc_size]
                            if return_offsets_mapping:
                                i_offset = i_offset[:-trunc_size]
                        else:
                            j = j[:-trunc_size]
                            if return_offsets_mapping:
                                j_offset = j_offset[:-trunc_size]
                        
                        if t > d:
                            i = i[:-((t - d + 1) // 2)]
                            j = j[:-((t - d) // 2)]
                            if return_offsets_mapping:
                                i_offset = i_offset[:-((t - d + 1) // 2)]
                                j_offset = j_offset[:-((t - d) // 2)]
                    elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
                        i = i[:max(max_length - special_token_size - len(j), 0)]
                        if return_offsets_mapping:
                            i_offset = i_offset[:max(max_length - special_token_size - len(j), 0)]
                    elif truncation_strategy == TruncationStrategy.ONLY_SECOND:
                        j = j[:max(max_length - special_token_size - len(i), 0)]
                        if return_offsets_mapping:
                            j_offset = j_offset[:max(max_length - special_token_size - len(i), 0)]

                if add_special_tokens and self._post_processor == 'bert':
                    c = np.concatenate([[self._tokenizer.cls_token_id], i, [self._tokenizer.sep_token_id], j, [self._tokenizer.sep_token_id]])
                    t = (np.arange(len(c)) >= len(i) + 2).astype(np.int64)
                    if return_offsets_mapping:
                        c_offset = np.concatenate([np.pad(i_offset, ((1, 1), (0, 0))), np.pad(j_offset, ((0, 1), (0, 0)))], axis=0)
                else:
                    c = np.concatenate([i, j])
                    t = (np.arange(len(c)) >= len(i)).astype(np.int64)
                    if return_offsets_mapping:
                        c_offset = np.concatenate([i_offset, j_offset], axis=0)
                input_ids.append(c)
                if return_attention_mask: attention_mask.append(np.ones_like(c))
                if return_token_type_ids: token_type_ids.append(t)
                if return_offsets_mapping: offset_mapping.append(c_offset)

        if padding_strategy == PaddingStrategy.LONGEST:
            final_length = max(map(len, input_ids))
            if pad_to_multiple_of: final_length = ((final_length + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
        elif padding_strategy == PaddingStrategy.MAX_LENGTH:
            final_length = max(max(map(len, input_ids)), max_length or 0)
            if pad_to_multiple_of: final_length = ((final_length + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
        else:
            final_length = None
        
        if final_length:
            for i in range(len(input_ids)):
                input_ids[i] = np.pad(input_ids[i], (0, final_length - len(input_ids[i])), constant_values=self._tokenizer.pad_token_id)
                
                try: attention_mask[i] = np.pad(attention_mask[i], (0, final_length - len(attention_mask[i])))
                except IndexError: pass
                
                try: token_type_ids[i] = np.pad(token_type_ids[i], (0, final_length - len(token_type_ids[i])))
                except IndexError: pass
                
                try: offset_mapping[i] = np.pad(offset_mapping[i], ((0, final_length - len(offset_mapping[i])), (0, 0)))
                except IndexError: pass

        if return_as_list:
            input_ids = list(map(np.ndarray.tolist, input_ids))
            attention_mask = list(map(np.ndarray.tolist, attention_mask))
            token_type_ids = list(map(np.ndarray.tolist, token_type_ids))
            offset_mapping = list(map(lambda x:list(map(tuple, x)), (map(np.ndarray.tolist, offset_mapping))))

        return input_ids, attention_mask, token_type_ids, offset_mapping

    def _decode(
        self, 
        token_ids, 
        skip_special_tokens = False, 
        clean_up_tokenization_spaces = True,
    ):
        if isinstance(token_ids, int): token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i not in self.all_special_ids]
        return self._tokenizer.decode(token_ids)

    def get_added_vocab(self) -> Dict[str, int]:
        return {}
    
    def get_vocab(self):
        return self._tokenizer.vocab
    
    @property
    def vocab(self):
        return self._tokenizer.vocab
    
    @property
    def vocab_size(self) -> int:
        return len(self._tokenizer.vocab)

    def __len__(self):
        return len(self._tokenizer)

    @property
    def is_fast(self) -> bool:
        return False

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._tokenizer.vocab.get(tokens, self._tokenizer.unk_token_id)

        ids = []
        for token in tokens:
            ids.append(self._tokenizer.vocab.get(token, self._tokenizer.unk_token_id))
        return ids

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        if self._post_processor == 'bert':
            return 3 if pair else 2
        return 0

    def _add_tokens(self, new_tokens, special_tokens = False) -> int:
        if all(t in self.vocab for t in new_tokens):
            return 0
        raise NotImplementedError("`KiwiTokenizer.add_tokens` is not support yet.")

    def tokenize(
        self, 
        text: str, 
        pair: Optional[str] = None, 
        add_special_tokens: bool = False,
    ) -> List[str]:
        ids = self.encode(text, pair, add_special_tokens=add_special_tokens)
        return list(map(self._tokenizer.id2vocab.__getitem__, ids))

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        if isinstance(ids, int):
            if skip_special_tokens and ids in self.all_special_ids: return ''
            return self._tokenizer.id2vocab[ids]

        if skip_special_tokens:
            ids = [i for i in ids if i not in self.all_special_ids]
        return list(map(self._tokenizer.id2vocab.__getitem__, ids))

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        save_directory = str(save_directory)

        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You "
                "might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        tokenizer_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + "tokenizer.json"
        )
        self._tokenizer.save(tokenizer_file)
        file_names = file_names + (tokenizer_file,)

        return file_names

Ancestors

transformers.tokenization_utils_base.PreTrainedTokenizerBase
transformers.tokenization_utils_base.SpecialTokensMixin
transformers.utils.hub.PushToHubMixin

Class variables

var max_model_input_sizes : Dict[str, Optional[int]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]

Instance variables

var bos_token : str

str: Beginning of sentence token. Log an error if used while not having been set.

Expand source code

@property
def bos_token(self) -> str:
    return self._tokenizer.bos_token

var bos_token_id : str

Optional[int]: Id of the beginning of sentence token in the vocabulary. Returns None if the token has not been set.

Expand source code

@property
def bos_token_id(self) -> str:
    return self._tokenizer.bos_token_id

var cls_token : str

str: Classification token, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set.

Expand source code

@property
def cls_token(self) -> str:
    return self._tokenizer.cls_token

var cls_token_id : str

Optional[int]: Id of the classification token in the vocabulary, to extract a summary of an input sequence leveraging self-attention along the full depth of the model.

Returns None if the token has not been set.

Expand source code

@property
def cls_token_id(self) -> str:
    return self._tokenizer.cls_token_id

var eos_token : str

str: End of sentence token. Log an error if used while not having been set.

Expand source code

@property
def eos_token(self) -> str:
    return self._tokenizer.eos_token

var eos_token_id : str

Optional[int]: Id of the end of sentence token in the vocabulary. Returns None if the token has not been set.

Expand source code

@property
def eos_token_id(self) -> str:
    return self._tokenizer.eos_token_id

var is_fast : bool

Expand source code

@property
def is_fast(self) -> bool:
    return False

var mask_token : str

str: Mask token, to use when training a model with masked-language modeling. Log an error if used while not having been set.

Expand source code

@property
def mask_token(self) -> str:
    return self._tokenizer.mask_token

var mask_token_id : str

Optional[int]: Id of the mask token in the vocabulary, used when training a model with masked-language modeling. Returns None if the token has not been set.

Expand source code

@property
def mask_token_id(self) -> str:
    return self._tokenizer.mask_token_id

var pad_token : str

str: Padding token. Log an error if used while not having been set.

Expand source code

@property
def pad_token(self) -> str:
    return self._tokenizer.pad_token

var pad_token_id : str

Optional[int]: Id of the padding token in the vocabulary. Returns None if the token has not been set.

Expand source code

@property
def pad_token_id(self) -> str:
    return self._tokenizer.pad_token_id

var sep_token : str

str: Separation token, to separate context and query in an input sequence. Log an error if used while not having been set.

Expand source code

@property
def sep_token(self) -> str:
    return self._tokenizer.sep_token

var sep_token_id : str

Optional[int]: Id of the separation token in the vocabulary, to separate context and query in an input sequence. Returns None if the token has not been set.

Expand source code

@property
def sep_token_id(self) -> str:
    return self._tokenizer.sep_token_id

var unk_token : str

str: Unknown token. Log an error if used while not having been set.

Expand source code

@property
def unk_token(self) -> str:
    return self._tokenizer.unk_token

var unk_token_id : str

Optional[int]: Id of the unknown token in the vocabulary. Returns None if the token has not been set.

Expand source code

@property
def unk_token_id(self) -> str:
    return self._tokenizer.unk_token_id

var vocab

Expand source code

@property
def vocab(self):
    return self._tokenizer.vocab

var vocab_size : int

Expand source code

@property
def vocab_size(self) -> int:
    return len(self._tokenizer.vocab)

Methods

def convert_ids_to_tokens(self, ids: Union[int, List[int]], skip_special_tokens: bool = False) ‑> Union[str, List[str]]

Expand source code

def convert_ids_to_tokens(
    self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
    if isinstance(ids, int):
        if skip_special_tokens and ids in self.all_special_ids: return ''
        return self._tokenizer.id2vocab[ids]

    if skip_special_tokens:
        ids = [i for i in ids if i not in self.all_special_ids]
    return list(map(self._tokenizer.id2vocab.__getitem__, ids))

def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) ‑> Union[int, List[int]]

Expand source code

def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
    if tokens is None:
        return None

    if isinstance(tokens, str):
        return self._tokenizer.vocab.get(tokens, self._tokenizer.unk_token_id)

    ids = []
    for token in tokens:
        ids.append(self._tokenizer.vocab.get(token, self._tokenizer.unk_token_id))
    return ids

def get_added_vocab(self) ‑> Dict[str, int]

Expand source code

def get_added_vocab(self) -> Dict[str, int]:
    return {}

def get_vocab(self)

Returns the vocabulary as a dictionary of token to index.

tokenizer.get_vocab()[token] is equivalent to tokenizer.convert_tokens_to_ids(token) when token is in the vocab.

Returns

Dict[str, int]: The vocabulary.

Expand source code

def get_vocab(self):
    return self._tokenizer.vocab

def num_special_tokens_to_add(self, pair: bool = False) ‑> int

Expand source code

def num_special_tokens_to_add(self, pair: bool = False) -> int:
    if self._post_processor == 'bert':
        return 3 if pair else 2
    return 0

def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False) ‑> List[str]

Converts a string in a sequence of tokens, replacing unknown tokens with the unk_token.

Args

text (str): The sequence to be encoded. pair (str, optional): A second sequence to be encoded with the first. add_special_tokens (bool, optional, defaults to False): Whether or not to add the special tokens associated with the corresponding model. kwargs (additional keyword arguments, optional): Will be passed to the underlying model specific encode method. See details in [~PreTrainedTokenizerBase.__call__]

Returns

List[str]: The list of tokens.

Expand source code

def tokenize(
    self, 
    text: str, 
    pair: Optional[str] = None, 
    add_special_tokens: bool = False,
) -> List[str]:
    ids = self.encode(text, pair, add_special_tokens=add_special_tokens)
    return list(map(self._tokenizer.id2vocab.__getitem__, ids))