Module kiwipiepy.transformers_addon
Added in version: 0.15.1
The transformers_addon module provides the KiwiTokenizer class, which wraps Kiwi's SwTokenizer so that it is compatible with the huggingface transformers tokenizer API.
Using this feature requires transformers>=4.12.
from transformers import AutoTokenizer
import kiwipiepy.transformers_addon
# Importing kiwipiepy.transformers_addon automatically registers
# KiwiTokenizer with AutoTokenizer.
# KiwiTokenizer is compatible with most of the functionality of huggingface
# transformers tokenizers, so existing transformers-based code can be reused as-is.
# Assumes an SwTokenizer has been saved to some_path/tokenizer.json
tokenizer = AutoTokenizer.from_pretrained('some_path')
tokenizer.encode("한국어를 고려한 토크나이저!")
Most of KiwiTokenizer's features are compatible with transformers, but the following are not currently supported; a usage sketch of the supported options follows the list below.
* add_tokens, add_special_tokens, and other ways of adding new tokens
* the stride, is_split_into_words, return_overflowing_tokens, return_special_tokens_mask, and return_length arguments of encode_plus
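The commonly used encoding options, by contrast, work as in transformers. A minimal sketch, assuming (as in the example above) an SwTokenizer saved at some_path/tokenizer.json whose config uses the bert post_processor; the path and the sample sentences are placeholders:
from transformers import AutoTokenizer
import kiwipiepy.transformers_addon
tokenizer = AutoTokenizer.from_pretrained('some_path')
# Batch encoding with padding and truncation; numpy tensors are requested.
batch = tokenizer(
    ["한국어를 고려한 토크나이저!", "두 번째 문장입니다."],
    padding='longest',
    truncation=True,
    max_length=32,
    return_tensors='np',
)
print(batch['input_ids'].shape)
# Sentence pairs: with the bert post_processor the result is
# [CLS] A [SEP] B [SEP] with matching token_type_ids.
pair = tokenizer("첫 번째 문장", "두 번째 문장", return_token_type_ids=True)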
Expand source code
'''
.. versionadded:: 0.15.1

The `transformers_addon` module provides the `KiwiTokenizer` class, which wraps Kiwi's SwTokenizer
so that it is compatible with the
[huggingface transformers tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer) API.
Using this feature requires `transformers>=4.12`.

```python
from transformers import AutoTokenizer
import kiwipiepy.transformers_addon
# Importing kiwipiepy.transformers_addon automatically registers
# KiwiTokenizer with AutoTokenizer.
# KiwiTokenizer is compatible with most of the functionality of huggingface
# transformers tokenizers, so existing transformers-based code can be reused as-is.
# Assumes an SwTokenizer has been saved to some_path/tokenizer.json
tokenizer = AutoTokenizer.from_pretrained('some_path')
tokenizer.encode("한국어를 고려한 토크나이저!")
```

Most of `KiwiTokenizer`'s features are compatible with transformers, but the following are not currently supported:

* `add_tokens`, `add_special_tokens`, and other ways of adding new tokens
* the `stride`, `is_split_into_words`, `return_overflowing_tokens`, `return_special_tokens_mask`, and `return_length` arguments of `encode_plus`
'''
import os
import itertools
from typing import Union, List, Optional, Dict, Tuple
import numpy as np
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import (
PreTrainedTokenizerBase,
TextInput,
TextInputPair,
PreTokenizedInput,
PreTokenizedInputPair,
EncodedInput,
EncodedInputPair,
PaddingStrategy,
TruncationStrategy,
TensorType,
BatchEncoding,
)
from kiwipiepy.sw_tokenizer import SwTokenizer, SwTokenizerConfig
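# Helper: re-pairs a flat iterator into consecutive 2-tuples (a1, a2), (a3, a4), ...
# Used below to regroup the flattened stream of encoded text pairs.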
def _group_by_two(iterator):
try:
while True:
a = next(iterator)
b = next(iterator)
yield a, b
except StopIteration:
pass
class KiwiTokenizer(PreTrainedTokenizerBase):
vocab_files_names = {"tokenizer_file": "tokenizer.json"}
def __init__(self, tokenizer_file=None, **kwargs):
if tokenizer_file is None:
raise ValueError(f"Cannot instantiate tokenizer from {tokenizer_file!r}")
self._tokenizer = SwTokenizer(tokenizer_file)
super().__init__(**kwargs)
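        # The optional 'post_processor' entry of the SwTokenizer config controls how special
        # tokens are attached; only the BERT scheme ([CLS] ... [SEP]) or none is supported.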
self._post_processor = self._tokenizer.config.additional.get('post_processor') if isinstance(self._tokenizer.config.additional, dict) else None
if self._post_processor not in (None, 'bert'):
raise ValueError(f"Unknown post_processor `{self._post_processor!r}`")
self._bos_token = self._tokenizer.bos_token
self._eos_token = self._tokenizer.eos_token
self._unk_token = self._tokenizer.unk_token
self._sep_token = self._tokenizer.sep_token
self._pad_token = self._tokenizer.pad_token
self._cls_token = self._tokenizer.cls_token
self._mask_token = self._tokenizer.mask_token
@property
def unk_token(self) -> str:
return self._tokenizer.unk_token
@unk_token.setter
def unk_token(self, s):
if s != self._tokenizer.unk_token:
raise AttributeError("can't set attribute 'unk_token'")
@property
def cls_token(self) -> str:
return self._tokenizer.cls_token
@cls_token.setter
def cls_token(self, s):
if s != self._tokenizer.cls_token:
raise AttributeError("can't set attribute 'cls_token'")
@property
def sep_token(self) -> str:
return self._tokenizer.sep_token
@sep_token.setter
def sep_token(self, s):
if s != self._tokenizer.sep_token:
raise AttributeError("can't set attribute 'sep_token'")
@property
def pad_token(self) -> str:
return self._tokenizer.pad_token
@pad_token.setter
def pad_token(self, s):
if s != self._tokenizer.pad_token:
raise AttributeError("can't set attribute 'pad_token'")
@property
def mask_token(self) -> str:
return self._tokenizer.mask_token
@mask_token.setter
def mask_token(self, s):
if s != self._tokenizer.mask_token:
raise AttributeError("can't set attribute 'mask_token'")
@property
def bos_token(self) -> str:
return self._tokenizer.bos_token
@bos_token.setter
def bos_token(self, s):
if s != self._tokenizer.bos_token:
raise AttributeError("can't set attribute 'bos_token'")
@property
def eos_token(self) -> str:
return self._tokenizer.eos_token
@eos_token.setter
def eos_token(self, s):
if s != self._tokenizer.eos_token:
raise AttributeError("can't set attribute 'eos_token'")
@property
def unk_token_id(self) -> str:
return self._tokenizer.unk_token_id
@property
def cls_token_id(self) -> str:
return self._tokenizer.cls_token_id
@property
def sep_token_id(self) -> str:
return self._tokenizer.sep_token_id
@property
def pad_token_id(self) -> str:
return self._tokenizer.pad_token_id
@property
def mask_token_id(self) -> str:
return self._tokenizer.mask_token_id
@property
def bos_token_id(self) -> str:
return self._tokenizer.bos_token_id
@property
def eos_token_id(self) -> str:
return self._tokenizer.eos_token_id
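    # Batch entry point used by PreTrainedTokenizerBase.__call__ / batch_encode_plus.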
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
List[PreTokenizedInputPair],
List[EncodedInput],
List[EncodedInputPair],
],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
if not isinstance(batch_text_or_text_pairs, (list, tuple)):
raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
input_ids, attention_mask, token_type_ids, offset_mapping = self._make_encoded(
batch_text_or_text_pairs, add_special_tokens,
return_token_type_ids, return_attention_mask, return_offsets_mapping,
return_as_list=(return_tensors is None),
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
)
data = dict(input_ids=input_ids)
if return_attention_mask: data['attention_mask'] = attention_mask
if return_token_type_ids: data['token_type_ids'] = token_type_ids
if return_offsets_mapping: data['offset_mapping'] = offset_mapping
for i in input_ids:
self._eventual_warn_about_too_long_sequence(i, max_length, verbose)
return BatchEncoding(data, tensor_type=return_tensors)
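    # Single text / text-pair entry point; wraps the input as a batch of one for _make_encoded.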
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
text = text if text_pair is None else (text, text_pair)
input_ids, attention_mask, token_type_ids, offset_mapping = self._make_encoded(
[text], add_special_tokens,
return_token_type_ids, return_attention_mask, return_offsets_mapping,
return_as_list=(return_tensors is None),
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
)
if return_tensors is None and not return_overflowing_tokens:
input_ids = input_ids[0]
if return_attention_mask: attention_mask = attention_mask[0]
if return_token_type_ids: token_type_ids = token_type_ids[0]
if return_offsets_mapping: offset_mapping = offset_mapping[0]
self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
else:
self._eventual_warn_about_too_long_sequence(input_ids[0], max_length, verbose)
data = dict(input_ids=input_ids)
if return_attention_mask: data['attention_mask'] = attention_mask
if return_token_type_ids: data['token_type_ids'] = token_type_ids
if return_offsets_mapping: data['offset_mapping'] = offset_mapping
return BatchEncoding(data, tensor_type=return_tensors)
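    # Shared implementation: runs SwTokenizer.encode, applies truncation, optional
    # BERT-style special tokens, and padding; returns parallel lists for each output field.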
def _make_encoded(
self,
batch_text_or_text_pairs,
add_special_tokens,
return_token_type_ids,
return_attention_mask,
return_offsets_mapping,
return_as_list = False,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
):
input_ids = []
attention_mask = []
token_type_ids = []
offset_mapping = []
if isinstance(batch_text_or_text_pairs[0], str): # single
special_token_size = self.num_special_tokens_to_add(False) if add_special_tokens else 0
for i in self._tokenizer.encode(batch_text_or_text_pairs, return_offsets=return_offsets_mapping):
if return_offsets_mapping:
i, i_offset = i
i_offset = i_offset.astype(np.int64)
i = i.astype(np.int64)
if (truncation_strategy in (TruncationStrategy.LONGEST_FIRST, TruncationStrategy.ONLY_FIRST)
and len(i) > max_length - special_token_size):
i = i[:max_length - special_token_size]
if return_offsets_mapping:
i_offset = i_offset[:max_length - special_token_size]
if add_special_tokens and self._post_processor == 'bert':
i = np.pad(i.astype(np.int64), (1, 1))
i[0] = self._tokenizer.cls_token_id
i[-1] = self._tokenizer.sep_token_id
if return_offsets_mapping:
i_offset = np.pad(i_offset, ((1, 1), (0, 0)))
input_ids.append(i)
if return_attention_mask: attention_mask.append(np.ones_like(i))
if return_token_type_ids: token_type_ids.append(np.zeros_like(i))
if return_offsets_mapping: offset_mapping.append(i_offset)
else: # pair
special_token_size = self.num_special_tokens_to_add(True) if add_special_tokens else 0
for i, j in _group_by_two(self._tokenizer.encode(itertools.chain.from_iterable(batch_text_or_text_pairs), return_offsets=return_offsets_mapping)):
if return_offsets_mapping:
i, i_offset = i
j, j_offset = j
i_offset = i_offset.astype(np.int64)
j_offset = j_offset.astype(np.int64)
if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
and len(i) + len(j) > max_length - special_token_size):
if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
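                        # LONGEST_FIRST: first trim the longer sequence down toward the
                        # shorter one, then split any remaining excess between the two.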
t = len(i) + len(j) - (max_length - special_token_size)
d = abs(len(i) - len(j))
trunc_size = min(d, t)
if len(i) > len(j):
i = i[:-trunc_size]
if return_offsets_mapping:
i_offset = i_offset[:-trunc_size]
else:
j = j[:-trunc_size]
if return_offsets_mapping:
j_offset = j_offset[:-trunc_size]
if t > d:
i = i[:-((t - d + 1) // 2)]
j = j[:-((t - d) // 2)]
if return_offsets_mapping:
i_offset = i_offset[:-((t - d + 1) // 2)]
j_offset = j_offset[:-((t - d) // 2)]
elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
i = i[:max(max_length - special_token_size - len(j), 0)]
if return_offsets_mapping:
i_offset = i_offset[:max(max_length - special_token_size - len(j), 0)]
elif truncation_strategy == TruncationStrategy.ONLY_SECOND:
j = j[:max(max_length - special_token_size - len(i), 0)]
if return_offsets_mapping:
j_offset = j_offset[:max(max_length - special_token_size - len(i), 0)]
if add_special_tokens and self._post_processor == 'bert':
c = np.concatenate([[self._tokenizer.cls_token_id], i, [self._tokenizer.sep_token_id], j, [self._tokenizer.sep_token_id]])
t = (np.arange(len(c)) >= len(i) + 2).astype(np.int64)
if return_offsets_mapping:
c_offset = np.concatenate([np.pad(i_offset, ((1, 1), (0, 0))), np.pad(j_offset, ((0, 1), (0, 0)))], axis=0)
else:
c = np.concatenate([i, j])
t = (np.arange(len(c)) >= len(i)).astype(np.int64)
if return_offsets_mapping:
c_offset = np.concatenate([i_offset, j_offset], axis=0)
input_ids.append(c)
if return_attention_mask: attention_mask.append(np.ones_like(c))
if return_token_type_ids: token_type_ids.append(t)
if return_offsets_mapping: offset_mapping.append(c_offset)
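        # Padding: LONGEST pads to the longest sequence in the batch; MAX_LENGTH pads to
        # max_length (or to the longest sequence if that is longer). Both round up to a
        # multiple of pad_to_multiple_of when it is given.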
if padding_strategy == PaddingStrategy.LONGEST:
final_length = max(map(len, input_ids))
if pad_to_multiple_of: final_length = ((final_length + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
elif padding_strategy == PaddingStrategy.MAX_LENGTH:
final_length = max(max(map(len, input_ids)), max_length or 0)
if pad_to_multiple_of: final_length = ((final_length + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
else:
final_length = None
if final_length:
for i in range(len(input_ids)):
input_ids[i] = np.pad(input_ids[i], (0, final_length - len(input_ids[i])), constant_values=self._tokenizer.pad_token_id)
try: attention_mask[i] = np.pad(attention_mask[i], (0, final_length - len(attention_mask[i])))
except IndexError: pass
try: token_type_ids[i] = np.pad(token_type_ids[i], (0, final_length - len(token_type_ids[i])))
except IndexError: pass
try: offset_mapping[i] = np.pad(offset_mapping[i], ((0, final_length - len(offset_mapping[i])), (0, 0)))
except IndexError: pass
if return_as_list:
input_ids = list(map(np.ndarray.tolist, input_ids))
attention_mask = list(map(np.ndarray.tolist, attention_mask))
token_type_ids = list(map(np.ndarray.tolist, token_type_ids))
offset_mapping = list(map(lambda x:list(map(tuple, x)), (map(np.ndarray.tolist, offset_mapping))))
return input_ids, attention_mask, token_type_ids, offset_mapping
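    # Decoding: optionally drops special-token ids, then delegates to SwTokenizer.decode.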
def _decode(
self,
token_ids,
skip_special_tokens = False,
clean_up_tokenization_spaces = True,
):
if isinstance(token_ids, int): token_ids = [token_ids]
if skip_special_tokens:
token_ids = [i for i in token_ids if i not in self.all_special_ids]
return self._tokenizer.decode(token_ids)
def get_added_vocab(self) -> Dict[str, int]:
return {}
def get_vocab(self):
return self._tokenizer.vocab
@property
def vocab(self):
return self._tokenizer.vocab
@property
def vocab_size(self) -> int:
return len(self._tokenizer.vocab)
def __len__(self):
return len(self._tokenizer)
@property
def is_fast(self) -> bool:
return False
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
if tokens is None:
return None
if isinstance(tokens, str):
return self._tokenizer.vocab.get(tokens, self._tokenizer.unk_token_id)
ids = []
for token in tokens:
ids.append(self._tokenizer.vocab.get(token, self._tokenizer.unk_token_id))
return ids
def num_special_tokens_to_add(self, pair: bool = False) -> int:
if self._post_processor == 'bert':
return 3 if pair else 2
return 0
def _add_tokens(self, new_tokens, special_tokens = False) -> int:
if all(t in self.vocab for t in new_tokens):
return 0
        raise NotImplementedError("`KiwiTokenizer.add_tokens` is not supported yet.")
def tokenize(
self,
text: str,
pair: Optional[str] = None,
add_special_tokens: bool = False,
) -> List[str]:
ids = self.encode(text, pair, add_special_tokens=add_special_tokens)
return list(map(self._tokenizer.id2vocab.__getitem__, ids))
def convert_ids_to_tokens(
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
if isinstance(ids, int):
if skip_special_tokens and ids in self.all_special_ids: return ''
return self._tokenizer.id2vocab[ids]
if skip_special_tokens:
ids = [i for i in ids if i not in self.all_special_ids]
return list(map(self._tokenizer.id2vocab.__getitem__, ids))
def _save_pretrained(
self,
save_directory: Union[str, os.PathLike],
file_names: Tuple[str],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
save_directory = str(save_directory)
if self.slow_tokenizer_class is None and legacy_format is True:
raise ValueError(
"Your tokenizer does not have a legacy version defined and therefore cannot register this version. You "
"might consider leaving the legacy_format at `None` or setting it to `False`."
)
tokenizer_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + "tokenizer.json"
)
self._tokenizer.save(tokenizer_file)
file_names = file_names + (tokenizer_file,)
return file_names
@property
def added_tokens_decoder(self):
return {}
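# Importing this module registers KiwiTokenizer with AutoTokenizer, so that
# AutoTokenizer.from_pretrained can return a KiwiTokenizer instance.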
AutoTokenizer.register('KiwiTokenizer', None, KiwiTokenizer)
Classes
class KiwiTokenizer (tokenizer_file=None, **kwargs)
-
Base class for [PreTrainedTokenizer] and [PreTrainedTokenizerFast].
Handles shared (mostly boilerplate) methods for those two classes.
Class attributes (overridden by derived classes)
- vocab_files_names (Dict[str, str]) -- A dictionary with, as keys, the __init__ keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
- pretrained_vocab_files_map (Dict[str, Dict[str, str]]) -- A dictionary of dictionaries, with the high-level keys being the __init__ keyword name of each vocabulary file required by the model, the low-level being the short-cut-names of the pretrained models with, as associated values, the url to the associated pretrained vocabulary file.
- model_input_names (List[str]) -- A list of inputs expected in the forward pass of the model.
- padding_side (str) -- The default value for the side on which the model should have padding applied. Should be 'right' or 'left'.
- truncation_side (str) -- The default value for the side on which the model should have truncation applied. Should be 'right' or 'left'.
Args
model_max_length (int, optional): The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is loaded with [~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained], this will be set to the value stored for the associated model in max_model_input_sizes (see above). If no value is provided, will default to VERY_LARGE_INTEGER (int(1e30)).
padding_side (str, optional): The side on which the model should have padding applied. Should be selected between ['right', 'left']. Default value is picked from the class attribute of the same name.
truncation_side (str, optional): The side on which the model should have truncation applied. Should be selected between ['right', 'left']. Default value is picked from the class attribute of the same name.
chat_template (str, optional): A Jinja template string that will be used to format lists of chat messages. See https://huggingface.co/docs/transformers/chat_templating for a full description.
model_input_names (List[string], optional): The list of inputs accepted by the forward pass of the model (like "token_type_ids" or "attention_mask"). Default value is picked from the class attribute of the same name.
bos_token (str or tokenizers.AddedToken, optional): A special token representing the beginning of a sentence. Will be associated to self.bos_token and self.bos_token_id.
eos_token (str or tokenizers.AddedToken, optional): A special token representing the end of a sentence. Will be associated to self.eos_token and self.eos_token_id.
unk_token (str or tokenizers.AddedToken, optional): A special token representing an out-of-vocabulary token. Will be associated to self.unk_token and self.unk_token_id.
sep_token (str or tokenizers.AddedToken, optional): A special token separating two different sentences in the same input (used by BERT for instance). Will be associated to self.sep_token and self.sep_token_id.
pad_token (str or tokenizers.AddedToken, optional): A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by attention mechanisms or loss computation. Will be associated to self.pad_token and self.pad_token_id.
cls_token (str or tokenizers.AddedToken, optional): A special token representing the class of the input (used by BERT for instance). Will be associated to self.cls_token and self.cls_token_id.
mask_token (str or tokenizers.AddedToken, optional): A special token representing a masked token (used by masked-language modeling pretraining objectives, like BERT). Will be associated to self.mask_token and self.mask_token_id.
additional_special_tokens (tuple or list of str or tokenizers.AddedToken, optional): A tuple or a list of additional special tokens. Add them here to ensure they are skipped when decoding with skip_special_tokens set to True. If they are not part of the vocabulary, they will be added at the end of the vocabulary.
clean_up_tokenization_spaces (bool, optional, defaults to True): Whether or not the model should clean up the spaces that were added when splitting the input text during the tokenization process.
split_special_tokens (bool, optional, defaults to False): Whether or not the special tokens should be split during the tokenization process. Passing will affect the internal state of the tokenizer. The default behavior is to not split special tokens. This means that if <s> is the bos_token, then tokenizer.tokenize("<s>") = ['<s>']. Otherwise, if split_special_tokens=True, then tokenizer.tokenize("<s>") will give ['<', 's', '>'].
Ancestors
- transformers.tokenization_utils_base.PreTrainedTokenizerBase
- transformers.tokenization_utils_base.SpecialTokensMixin
- transformers.utils.hub.PushToHubMixin
Class variables
var model_input_names : List[str]
var padding_side : str
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]
Instance variables
var added_tokens_decoder
-
Expand source code
@property
def added_tokens_decoder(self):
    return {}
var bos_token : str
-
str: Beginning of sentence token. Log an error if used while not having been set.
Expand source code
@property
def bos_token(self) -> str:
    return self._tokenizer.bos_token
var bos_token_id : str
-
Optional[int]: Id of the beginning of sentence token in the vocabulary. Returns None if the token has not been set.
Expand source code
@property
def bos_token_id(self) -> str:
    return self._tokenizer.bos_token_id
var cls_token : str
-
str: Classification token, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set.
Expand source code
@property
def cls_token(self) -> str:
    return self._tokenizer.cls_token
var cls_token_id : str
-
Optional[int]: Id of the classification token in the vocabulary, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Returns None if the token has not been set.
Expand source code
@property
def cls_token_id(self) -> str:
    return self._tokenizer.cls_token_id
var eos_token : str
-
str: End of sentence token. Log an error if used while not having been set.
Expand source code
@property
def eos_token(self) -> str:
    return self._tokenizer.eos_token
var eos_token_id : str
-
Optional[int]: Id of the end of sentence token in the vocabulary. Returns None if the token has not been set.
Expand source code
@property
def eos_token_id(self) -> str:
    return self._tokenizer.eos_token_id
var is_fast : bool
-
Expand source code
@property
def is_fast(self) -> bool:
    return False
var mask_token : str
-
str: Mask token, to use when training a model with masked-language modeling. Log an error if used while not having been set.
Expand source code
@property
def mask_token(self) -> str:
    return self._tokenizer.mask_token
var mask_token_id : str
-
Optional[int]: Id of the mask token in the vocabulary, used when training a model with masked-language modeling. Returns None if the token has not been set.
Expand source code
@property
def mask_token_id(self) -> str:
    return self._tokenizer.mask_token_id
var pad_token : str
-
str: Padding token. Log an error if used while not having been set.
Expand source code
@property
def pad_token(self) -> str:
    return self._tokenizer.pad_token
var pad_token_id : str
-
Optional[int]: Id of the padding token in the vocabulary. Returns None if the token has not been set.
Expand source code
@property
def pad_token_id(self) -> str:
    return self._tokenizer.pad_token_id
var sep_token : str
-
str: Separation token, to separate context and query in an input sequence. Log an error if used while not having been set.
Expand source code
@property
def sep_token(self) -> str:
    return self._tokenizer.sep_token
var sep_token_id : str
-
Optional[int]: Id of the separation token in the vocabulary, to separate context and query in an input sequence. Returns None if the token has not been set.
Expand source code
@property
def sep_token_id(self) -> str:
    return self._tokenizer.sep_token_id
var unk_token : str
-
str: Unknown token. Log an error if used while not having been set.
Expand source code
@property
def unk_token(self) -> str:
    return self._tokenizer.unk_token
var unk_token_id : str
-
Optional[int]: Id of the unknown token in the vocabulary. Returns None if the token has not been set.
Expand source code
@property
def unk_token_id(self) -> str:
    return self._tokenizer.unk_token_id
var vocab
-
Expand source code
@property
def vocab(self):
    return self._tokenizer.vocab
var vocab_size : int
-
Expand source code
@property
def vocab_size(self) -> int:
    return len(self._tokenizer.vocab)
Methods
def convert_ids_to_tokens(self, ids: Union[int, List[int]], skip_special_tokens: bool = False) ‑> Union[str, List[str]]
-
Expand source code
def convert_ids_to_tokens(
    self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
    if isinstance(ids, int):
        if skip_special_tokens and ids in self.all_special_ids: return ''
        return self._tokenizer.id2vocab[ids]
    if skip_special_tokens:
        ids = [i for i in ids if i not in self.all_special_ids]
    return list(map(self._tokenizer.id2vocab.__getitem__, ids))
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) ‑> Union[int, List[int]]
-
Expand source code
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
    if tokens is None:
        return None
    if isinstance(tokens, str):
        return self._tokenizer.vocab.get(tokens, self._tokenizer.unk_token_id)
    ids = []
    for token in tokens:
        ids.append(self._tokenizer.vocab.get(token, self._tokenizer.unk_token_id))
    return ids
def get_added_vocab(self) ‑> Dict[str, int]
-
Expand source code
def get_added_vocab(self) -> Dict[str, int]:
    return {}
def get_vocab(self)
-
Returns the vocabulary as a dictionary of token to index.
tokenizer.get_vocab()[token] is equivalent to tokenizer.convert_tokens_to_ids(token) when token is in the vocab.
Returns
Dict[str, int]: The vocabulary.
Expand source code
def get_vocab(self):
    return self._tokenizer.vocab
def num_special_tokens_to_add(self, pair: bool = False) ‑> int
-
Expand source code
def num_special_tokens_to_add(self, pair: bool = False) -> int:
    if self._post_processor == 'bert':
        return 3 if pair else 2
    return 0
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False) ‑> List[str]
-
Converts a string into a sequence of tokens, replacing unknown tokens with the unk_token.
Args
text (str): The sequence to be encoded.
pair (str, optional): A second sequence to be encoded with the first.
add_special_tokens (bool, optional, defaults to False): Whether or not to add the special tokens associated with the corresponding model.
kwargs (additional keyword arguments, optional): Will be passed to the underlying model specific encode method. See details in [~PreTrainedTokenizerBase.__call__].
Returns
List[str]: The list of tokens.
Expand source code
def tokenize(
    self,
    text: str,
    pair: Optional[str] = None,
    add_special_tokens: bool = False,
) -> List[str]:
    ids = self.encode(text, pair, add_special_tokens=add_special_tokens)
    return list(map(self._tokenizer.id2vocab.__getitem__, ids))
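Taken together, these methods mirror the usual transformers tokenizer surface. A minimal sketch of how they fit together, again assuming an SwTokenizer saved at some_path/tokenizer.json (the path and sentence are placeholders):
from transformers import AutoTokenizer
import kiwipiepy.transformers_addon
tok = AutoTokenizer.from_pretrained('some_path')
pieces = tok.tokenize("한국어를 고려한 토크나이저!")    # subword strings
ids = tok.convert_tokens_to_ids(pieces)                  # vocabulary ids
assert tok.get_vocab()[pieces[0]] == ids[0]               # get_vocab()[token] == convert_tokens_to_ids(token)
restored = tok.convert_ids_to_tokens(ids)                 # back to subword strings
text = tok.decode(tok.encode("한국어를 고려한 토크나이저!"), skip_special_tokens=True)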