Module tomotopy.utils
Expand source code
'''
Submodule `tomotopy.utils` provides various utilities for topic modeling.
`tomotopy.utils.Corpus` class helps manage multiple documents easily.
The documents inserted into `Corpus` can be used with any topic model, and you can save the preprocessed corpus into a file and load the corpus from a file.
'''
# The description above has to be the first statement of the module: a string
# placed after the imports is a plain expression and never becomes `__doc__`.
from typing import Optional, List
import re

from _tomotopy import (_UtilsCorpus, _UtilsVocabDict)
class Corpus(_UtilsCorpus):
    '''`Corpus` class is a utility that makes it easy to manage large amounts of documents.
    An instance of `Corpus` can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes.
    '''

    class _VocabDict(_UtilsVocabDict):
        pass

    def __init__(self, tokenizer=None, batch_size=64, stopwords=None):
        '''Parameters
        ----------
        tokenizer : Union[Callable[[str, Any], List[Union[str, Tuple[str, int, int]]]], Callable[[Iterable[Tuple[str, Any]]], Iterable[List[Union[str, Tuple[str, int, int]]]]]]
            a callable object for tokenizing raw documents. If `tokenizer` is provided, you can use `tomotopy.utils.Corpus.add_doc` method with `raw` and `user_data` parameters.
            `tokenizer` receives two arguments `raw` and `user_data` and
            it should return an iterable of `str`(the tokenized word) or of Tuple[`str`, `int`, `int`] (the tokenized word, starting position of the word, the length of the word).
        batch_size : int
            `tomotopy.utils.Corpus.process` method reads a batch of documents and sends them to `tomotopy.utils.Corpus.add_doc`. `batch_size` indicates the size of the batch.
        stopwords : Union[Iterable[str], Callable[str, bool]]
            When calling `tomotopy.utils.Corpus.add_doc`, words in `stopwords` are not added to the document but are excluded.
            If `stopwords` is callable, a word is excluded from the document when `stopwords(word) == True`.
            A non-callable `stopwords` is copied into a set once, at construction time.
        '''
        super().__init__()
        self._tokenizer = tokenizer
        self._batch_size = batch_size
        if callable(stopwords):
            self._stopwords = stopwords
        elif stopwords is None:
            self._stopwords = None
        else:
            # Materialize the collection once. Building the set inside the
            # predicate would redo the work on every lookup and would silently
            # stop filtering after the first lookup for a one-shot iterator.
            stopword_set = frozenset(stopwords)
            self._stopwords = lambda x: x in stopword_set

    def _select_args_for_model(self, model_type:type, args:dict):
        '''Return the subset of `args` whose keys are accepted by `model_type`.

        Parameters
        ----------
        model_type : type
            a topic model class of `tomotopy`
        args : dict
            arbitrary keyword arguments attached to a document

        Returns
        -------
        selected : dict
            the entries of `args` relevant to `model_type`; empty for any other model class
        '''
        import tomotopy as tp
        # One-element tuples are required here: `k in ('metadata')` is a
        # substring test against the plain string 'metadata'.
        if model_type in (tp.DMRModel, tp.GDMRModel):
            accepted = ('metadata',)
        elif model_type in (tp.LLDAModel, tp.PLDAModel):
            accepted = ('labels',)
        elif model_type is tp.MGLDAModel:
            accepted = ('delimiter',)
        elif model_type is tp.SLDAModel:
            accepted = ('y',)
        elif model_type is tp.DTModel:
            accepted = ('timepoint',)
        else:
            return {}
        return {k: v for k, v in args.items() if k in accepted}

    def add_doc(self, words=None, raw=None, user_data=None, **kargs):
        '''Add a new document into the corpus and return an index of the inserted document.
        This method requires either `words` parameter or `raw` and `user_data` parameters.
        If `words` parameter is provided, `words` are expected to be already preprocessed results.
        If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.

        If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.DMRModel` or `y` for `tomotopy.SLDAModel`, you can pass it as an arbitrary keyword argument.

        Parameters
        ----------
        words : Iterable[str]
            a list of words that are already preprocessed
        raw : str
            a raw string of document which isn't preprocessed yet.
            The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
        user_data : Any
            user data for `tokenizer`. The `raw` and `user_data` parameters are sent to `tokenizer`.
        **kargs
            arbitrary keyword arguments for specific topic models
        '''
        return super().add_doc(words, raw, user_data, **kargs)

    def process(self, data_feeder, show_progress=False, total=None):
        '''Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted.

        Parameters
        ----------
        data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
            any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`).
        show_progress : bool
            if true, `data_feeder` is wrapped with `tqdm` (imported on demand) to display progress
        total : int
            forwarded to `tqdm` as its `total` argument; only used when `show_progress` is true
        '''
        if self._tokenizer is None:
            raise ValueError("`tokenizer` must be set when using `tomotopy.utils.Corpus.process`")

        num_read = 0
        raw_list = []
        metadata_list = []
        if show_progress:
            from tqdm import tqdm
            data_feeder_iter = iter(tqdm(data_feeder, total=total))
        else:
            data_feeder_iter = iter(data_feeder)

        def _generate():
            # Yields at most `batch_size` (raw, user_data) pairs per call and
            # records the raw text and keyword arguments of each in parallel lists.
            nonlocal num_read
            for _, d in zip(range(self._batch_size), data_feeder_iter):
                num_read += 1
                if isinstance(d, tuple) and len(d) == 2:
                    raw_list.append(d[0])
                    metadata_list.append({})
                    yield d
                elif isinstance(d, tuple) and len(d) == 3:
                    raw_list.append(d[0])
                    metadata_list.append(d[2])
                    yield d[:2]
                elif isinstance(d, str):
                    raw_list.append(d)
                    metadata_list.append({})
                    yield (d, None)
                else:
                    raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]")

        while True:
            # The tokenizer is called in batch mode with an iterable of
            # (raw, user_data) pairs; the side lists are filled as it consumes them.
            added = super().add_docs(self._tokenizer(_generate()), iter(raw_list), iter(metadata_list))
            if added == 0:
                break
            raw_list.clear()
            metadata_list.clear()
        return num_read

    def save(self, filename:str, protocol=0):
        '''Save the current instance into the file `filename`.

        Parameters
        ----------
        filename : str
            a path for the file where the instance is saved
        protocol : int
            NOTE(review): accepted for compatibility but currently not forwarded to `pickle.dump`,
            so the default pickle protocol is always used.
        '''
        import pickle
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename:str):
        '''Load and return an instance from the file `filename`.
        The `stopwords` filter of the returned instance is reset to `None`.

        Parameters
        ----------
        filename : str
            a path for the file to be loaded
        '''
        import pickle
        # SECURITY: unpickling can execute arbitrary code; only load files from trusted sources.
        with open(filename, 'rb') as f:
            obj = pickle.load(f)
        obj._stopwords = None
        return obj

    def __len__(self):
        return super().__len__()

    def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=float('-inf'), normalized=False, workers=0):
        '''..versionadded:: 0.10.0

        Extract frequent n-grams using PMI score

        Parameters
        ----------
        min_cf : int
            Minimum collection frequency of n-grams to be extracted
        min_df : int
            Minimum document frequency of n-grams to be extracted
        max_len : int
            Maximum length of n-grams to be extracted
        max_cand : int
            Maximum number of n-grams to be extracted
        min_score : float
            Minimum PMI score of n-grams to be extracted
        normalized : bool
            whether to use Normalized PMI or just PMI
        workers : int
            an integer indicating the number of workers to perform samplings.
            If `workers` is 0, the number of cores in the system will be used.

        Returns
        -------
        candidates : List[tomotopy.label.Candidate]
            The extracted n-gram candidates in `tomotopy.label.Candidate` type
        '''
        return super().extract_ngrams(min_cf, min_df, max_len, max_cand, min_score, normalized, workers)

    def concat_ngrams(self, cands, delimiter='_'):
        '''..versionadded:: 0.10.0

        Concatenate n-gram matched given candidates in the corpus into single word

        Parameters
        ----------
        cands : Iterable[tomotopy.label.Candidate]
            n-gram candidates to be concatenated. It can be generated by `tomotopy.utils.Corpus.extract_ngrams`.
        delimiter : str
            Delimiter to be used for concatenating words. Default value is `'_'`.
        '''
        return super().concat_ngrams(cands, delimiter)
class SimpleTokenizer:
    '''`SimpleTokenizer` provides a simple word-tokenizing utility with an arbitrary stemmer.'''

    def __init__(self,
                 stemmer = None,
                 pattern:Optional[str] = None,
                 lowercase = True,
                 ngram_list:Optional[List[str]] = None,
                 ngram_delimiter:str = '_',
                 ):
        '''Parameters
        ----------
        stemmer : Callable[str, str]
            a callable object for stemming words. If this value is set to `None`, words are not stemmed.
        pattern : str
            a regex pattern for extracting tokens
        lowercase : bool
            converts the token into lowercase if this is True
        ngram_list : List[str]
            n-grams to be emitted as a single token. Each entry is split with `pattern`;
            entries that yield fewer than two words are ignored.
        ngram_delimiter : str
            a string used for joining the words of a matched n-gram. Default value is `'_'`.

        Here is an example of using SimpleTokenizer with NLTK for stemming.

        .. include:: ./auto_labeling_code_with_porter.rst
        '''
        self._pat = re.compile(pattern or r"""[^\s.,;:'"?!<>(){}\[\]\\/`~@#$%^&*|]+""")
        if stemmer and not callable(stemmer):
            raise ValueError("`stemmer` must be callable.")
        self._stemmer = stemmer or None
        self._lowercase = lowercase
        self._ngram_pat = None
        self._ngram_delimiter = ngram_delimiter
        if ngram_list:
            self.build_ngram_pat(ngram_list)

    def _normalize(self, word:str) -> str:
        '''Apply lowercasing and stemming, in that order, as configured.'''
        if self._lowercase:
            word = word.lower()
        if self._stemmer is not None:
            word = self._stemmer(word)
        return word

    def build_ngram_pat(self, ngram_list:List[str]):
        '''Compile the n-gram matcher from `ngram_list`, replacing any previous one.

        Every distinct normalized word is mapped to a private single character
        (code points from 256 upward), and each n-gram becomes the string of its
        word characters. The alternatives are sorted longest first so that a
        longer n-gram wins over its own prefix.

        Parameters
        ----------
        ngram_list : List[str]
            n-grams to be merged; entries yielding fewer than two words are skipped
        '''
        ngram_vocab = {}
        patterns = []
        for ngram in ngram_list:
            # Split and normalize exactly like `_tokenize` does: `finditer` plus
            # `group()` returns the whole match even if the pattern has capture
            # groups, and matching happens before lowercasing.
            words = [self._normalize(m.group()) for m in self._pat.finditer(ngram)]
            if len(words) < 2:
                continue
            chrs = []
            for word in words:
                # `len(ngram_vocab)` is evaluated before a possible insertion,
                # so a new word gets the next unused code point.
                chrs.append(ngram_vocab.setdefault(word, chr(len(ngram_vocab) + 256)))
            patterns.append(''.join(chrs))

        # Keep the compiled pattern and the vocabulary in sync: with nothing to
        # match, drop any pattern left over from a previous build.
        if patterns:
            self._ngram_pat = re.compile('|'.join(sorted(patterns, key=len, reverse=True)))
        else:
            self._ngram_pat = None
        self._ngram_vocab = ngram_vocab

    def _tokenize(self, raw:str):
        '''Yield `(word, start, length)` for each token of `raw`.

        When an n-gram matcher is set, consecutive tokens forming a registered
        n-gram and separated only by whitespace are merged into one token that
        spans from the first word to the last.
        '''
        if self._ngram_pat is None:
            for g in self._pat.finditer(raw):
                start, end = g.span()
                yield self._normalize(g.group()), start, end - start
            return

        all_words = []
        all_spans = []
        chrs = []
        for g in self._pat.finditer(raw):
            word = self._normalize(g.group())
            all_spans.append(g.span())
            all_words.append(word)
            # Words outside the n-gram vocabulary become ' ', which no pattern contains.
            chrs.append(self._ngram_vocab.get(word, ' '))
        chrs = ''.join(chrs)

        # Positions in `chrs` are token indices, so a match span is a token range.
        for g in self._ngram_pat.finditer(chrs):
            s, e = g.span()
            # Merge only if every gap between neighbouring tokens is pure whitespace
            # (an empty gap does not count, since ''.isspace() is False).
            is_space = all(raw[ns:ne].isspace() for (_, ns), (ne, _) in zip(all_spans[s:e-1], all_spans[s+1:e]))
            if not is_space:
                continue
            all_words[s] = self._ngram_delimiter.join(all_words[s:e])
            all_words[s+1:e] = [None] * (e - s - 1)
            all_spans[s] = (all_spans[s][0], all_spans[e-1][1])

        for (s, e), word in zip(all_spans, all_words):
            if word is None:
                continue
            yield word, s, e - s

    def __call__(self, raw:str, user_data=None):
        '''Tokenize one document or a batch of documents.

        Parameters
        ----------
        raw : Union[str, Iterable[Union[str, Tuple[str, Any]]]]
            a raw document, or (when `user_data` is `None`) an iterable of
            `(raw, user_data)` pairs or bare raw strings
        user_data : Any
            ignored by this tokenizer

        Returns
        -------
        tokens : Iterable
            for a single document, an iterable of `(word, start, length)`;
            for a batch, an iterable of lists of `(word, start, length)`, one list per document
        '''
        is_iterable = False
        # test raw is iterable
        if user_data is None and not isinstance(raw, str):
            try:
                iter(raw)
                is_iterable = True
            except TypeError:
                pass
        if is_iterable:
            for item in raw:
                # A bare string must not be unpacked as a pair: a 2-character
                # string would silently be tokenized as its first character.
                if isinstance(item, str):
                    text = item
                else:
                    text, _ = item
                yield list(self._tokenize(text))
        else:
            yield from self._tokenize(raw)
# Optional Korean documentation. When the environment variable TOMOTOPY_LANG
# is set to 'kr', the module description is replaced and Korean texts are
# registered in `__pdoc__` under the names of the documented objects.
# NOTE(review): `__pdoc__` is presumably the override dictionary read by the
# pdoc documentation generator -- confirm against the docs build.
# `os` is imported only for this check and removed from the namespace afterwards.
import os
if os.environ.get('TOMOTOPY_LANG') == 'kr':
    # Korean module description.
    __doc__ = """`tomotopy.utils` 서브모듈은 토픽 모델링에 유용한 여러 유틸리티를 제공합니다.
`tomotopy.utils.Corpus` 클래스는 대량의 문헌을 관리할 수 있게 돕습니다. `Corpus`에 입력된 문헌들은 다양한 토픽 모델에 바로 입력될 수 있습니다.
또한 코퍼스 전처리 결과를 파일에 저장함으로써 필요에 따라 다시 코퍼스를 파일에서 읽어들여 원하는 토픽 모델에 입력할 수 있습니다.
"""
    # `__pdoc__` is only defined on this branch.
    __pdoc__ = {}
    # Korean text for `Corpus` (class description and constructor parameters).
    __pdoc__['Corpus'] = """`Corpus`는 대량의 문헌을 간편하게 다룰 수 있게 도와주는 유틸리티 클래스입니다.
`Corpus` 클래스의 인스턴스는 여러 개의 문헌을 포함할 수 있으며, 토픽 모델 클래스에 파라미터로 직접 넘겨질 수 있습니다.
Parameters
----------
tokenizer : Callable[[str, Any], Iterable[Union[str, Tuple[str, int, int]]]]
비정제 문헌을 처리하는 데에 사용되는 호출 가능한 객체. `tokenizer`가 None이 아닌 값으로 주어진 경우, `tomotopy.utils.Corpus.add_doc` 메소드를 호출할 때 `raw` 및 `user_data` 파라미터를 사용할 수 있습니다.
`tokenizer`는 인수로 `raw`와 `user_data` 2개를 받으며, 처리 결과로 `str`(정제된 단어) 혹은 Tuple[`str`, `int`, `int`] (정제된 단어, 단어 시작 위치, 단어 길이)의 iterable을 반환해야 합니다.
batch_size : int
`tomotopy.utils.Corpus.process` 메소드는 대량의 문헌을 읽어들인 후 `tomotopy.utils.Corpus.add_doc`으로 넘깁니다. 이 때 한번에 읽어들이는 문헌의 개수를 `batch_size`로 지정할 수 있습니다.
stopwords : Iterable[str]
`tomotopy.utils.Corpus.add_doc`가 호출될 때, `stopwords`에 포함된 단어들은 처리 단계에서 등록되지 않고 제외됩니다.
`stopwords`가 호출가능한 경우, `stopwords(word) == True`이면 word는 불용어 처리되어 제외됩니다."""
    # Korean text for `Corpus.add_doc`.
    __pdoc__['Corpus.add_doc'] = """새 문헌을 코퍼스에 추가하고 추가된 문헌의 인덱스 번호를 반환합니다.
이 메소드는 `words` 파라미터나 `raw`, `user_data` 파라미터 둘 중 하나를 요구합니다.
`words` 파라미터를 사용할 경우, `words`는 이미 전처리된 단어들의 리스트여야 합니다.
`raw` 파라미터를 사용할 경우, `raw`는 정제되기 전 문헌의 str이며, `tokenizer`가 이 비정제문헌을 처리하기 위해 호출됩니다.
만약 `tomotopy.DMRModel`의 `metadata`나 `tomotopy.SLDAModel`의 `y`처럼 특정한 토픽 모델에 필요한 추가 파라미터가 있다면 임의 키워드 인자로 넘겨줄 수 있습니다.
Parameters
----------
words : Iterable[str]
이미 전처리된 단어들의 리스트
raw : str
전처리되기 이전의 문헌.
이 파라미터를 사용하려면 인스턴스 생성시 `tokenizer` 파라미터를 넣어줘야 합니다.
user_data : Any
`tokenizer`에 넘어가는 유저 데이터. `raw`와 `user_data` 파라미터가 함께 `tokenizer`로 넘어갑니다.
**kargs
추가적인 파라미터를 위한 임의 키워드 인자"""
    # Korean text for `Corpus.process`.
    __pdoc__['Corpus.process'] = """이터레이터 `data_feeder`를 통해 다수의 문헌을 코퍼스에 추가하고, 추가된 문헌의 개수를 반환합니다.
Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
문자열 `raw`이나, 튜플 (`raw`, `user_data`), 혹은 튜플 (`raw`, `user_data`, `kargs`) 를 반환하는 이터레이터. """
    # Korean text for `Corpus.save`.
    __pdoc__['Corpus.save'] = """현재 인스턴스를 파일 `filename`에 저장합니다..
Parameters
----------
filename : str
인스턴스가 저장될 파일의 경로"""
    # Korean text for `Corpus.load`.
    __pdoc__['Corpus.load'] = """파일 `filename`로부터 인스턴스를 읽어들여 반환합니다.
Parameters
----------
filename : str
읽어들일 파일의 경로"""
    # Korean text for `Corpus.extract_ngrams`.
    # NOTE(review): unlike the English docstring, this text has no entries for
    # the `normalized` and `workers` parameters.
    __pdoc__['Corpus.extract_ngrams'] = '''..versionadded:: 0.10.0
PMI 점수를 이용해 자주 등장하는 n-gram들을 추출합니다.
Parameters
----------
min_cf : int
추출할 n-gram의 최소 장서빈도
min_df : int
추출할 n-gram의 최소 문헌빈도
max_len : int
추출할 n-gram의 최대 길이
max_cand : int
추출할 n-gram의 갯수
min_score : float
추출할 n-gram의 최소 PMI 점수
Returns
-------
candidates : List[tomotopy.label.Candidate]
추출된 n-gram 후보의 리스트. `tomotopy.label.Candidate` 타입
'''
    # Korean text for `Corpus.concat_ngrams`.
    __pdoc__['Corpus.concat_ngrams'] = '''..versionadded:: 0.10.0
코퍼스 내에서 주어진 n-gram 목록과 일치하는 단어열을 하나의 단어로 합칩니다.
Parameters
----------
cands : Iterable[tomotopy.label.Candidate]
합칠 n-gram의 List. `tomotopy.utils.Corpus.extract_ngrams`로 생성할 수 있습니다.
delimiter : str
여러 단어들을 연결할 때 사용할 구분자. 기본값은 `'_'`입니다.
'''
    # Korean text for `SimpleTokenizer`.
    __pdoc__['SimpleTokenizer'] = """`SimpleTokenizer`는 임의의 스테머를 사용할 수 있는 단순한 단어 분리 유틸리티입니다.
Parameters
----------
stemmer : Callable[str, str]
단어를 스테밍하는데 사용되는 호출가능한 객체. 만약 이 값이 `None`이라면 스테밍은 사용되지 않습니다.
pattern : str
토큰을 추출하는데 사용할 정규식 패턴
lowercase : bool
참일 경우 분리된 단어들을 소문자화합니다.
SimpleTokenizer와 NLTK를 사용하여 스테밍을 하는 예제는 다음과 같습니다.
.. include:: ./auto_labeling_code_with_porter.rst"""
# Keep the helper import out of the module's public namespace.
del os
Classes
class Corpus (tokenizer=None, batch_size=64, stopwords=None)
-
`Corpus` class is a utility that makes it easy to manage large amounts of documents. An instance of `Corpus` can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes.
Parameters
tokenizer
:Union[Callable[[str, Any], List[Union[str, Tuple[str, int, int]]]], Callable[[Iterable[Tuple[str, Any]]], Iterable[List[Union[str, Tuple[str, int, int]]]]]]
- a callable object for tokenizing raw documents. If `tokenizer` is provided, you can use the `Corpus.add_doc()` method with `raw` and `user_data` parameters. `tokenizer` receives two arguments, `raw` and `user_data`, and it should return an iterable of `str` (the tokenized word) or of Tuple[`str`, `int`, `int`] (the tokenized word, starting position of the word, the length of the word).
batch_size
:int
- The `Corpus.process()` method reads a batch of documents and sends them to `Corpus.add_doc()`. `batch_size` indicates the size of the batch.
stopwords
:Union[Iterable[str], Callable[str, bool]]
- When calling `Corpus.add_doc()`, words in `stopwords` are not added to the document but are excluded. If `stopwords` is callable, a word is excluded from the document when `stopwords(word) == True`.
Expand source code
class Corpus(_UtilsCorpus): '''`Corpus` class is a utility that makes it easy to manage large amounts of documents. An instance of `Corpus` can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes. ''' class _VocabDict(_UtilsVocabDict): pass def __init__(self, tokenizer=None, batch_size=64, stopwords=None): '''Parameters ---------- tokenizer : Union[Callable[[str, Any], List[Union[str, Tuple[str, int, int]]]], Callable[[Iterable[Tuple[str, Any]]], Iterable[List[Union[str, Tuple[str, int, int]]]]]] a callable object for tokenizing raw documents. If `tokenizer` is provided, you can use `tomotopy.utils.Corpus.add_doc` method with `raw` and `user_data` parameters. `tokenizer` receives two arguments `raw` and `user_data` and it should return an iterable of `str`(the tokenized word) or of Tuple[`str`, `int`, `int`] (the tokenized word, starting position of the word, the length of the word). batch_size : int `tomotopy.utils.Corpus.process` method reads a bunch of documents and send them to `tomotopy.utils.Corpus.add_doc`. `batch_size` indicates the size of the bunch. stopwords : Union[Iterable[str], Callable[str, bool]] When calling `tomotopy.utils.Corpus.add_doc`, words in `stopwords` are not added to the document but are excluded. If `stopwords` is callable, a word is excluded from the document when `stopwords(word) == True`. 
''' super().__init__() self._tokenizer = tokenizer self._batch_size = batch_size if callable(stopwords): self._stopwords = stopwords elif stopwords is None: self._stopwords = None else: self._stopwords = lambda x: x in set(stopwords) def _select_args_for_model(self, model_type:type, args:dict): import tomotopy as tp if model_type in (tp.DMRModel, tp.GDMRModel): return {k:v for k, v in args.items() if k in ('metadata')} if model_type in (tp.LLDAModel, tp.PLDAModel): return {k:v for k, v in args.items() if k in ('labels')} if model_type is tp.MGLDAModel: return {k:v for k, v in args.items() if k in ('delimiter')} if model_type is tp.SLDAModel: return {k:v for k, v in args.items() if k in ('y')} if model_type is tp.DTModel: return {k:v for k, v in args.items() if k in ('timepoint')} return {} def add_doc(self, words=None, raw=None, user_data=None, **kargs): '''Add a new document into the corpus and return an index of the inserted document. This method requires either `words` parameter or `raw` and `user_data` parameters. If `words` parameter is provided, `words` are expected to be already preprocessed results. If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document. If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.DMRModel` or `y` for `tomotopy.SLDAModel`, you can pass it as an arbitrary keyword argument. Parameters ---------- words : Iterable[str] a list of words that are already preprocessed raw : str a raw string of document which isn't preprocessed yet. The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set. user_data : Any an user data for `tokenizer`. The `raw` and `user_data` parameter are sent to `tokenizer`. 
**kargs arbitrary keyword arguments for specific topic models ''' return super().add_doc(words, raw, user_data, **kargs) def process(self, data_feeder, show_progress=False, total=None): '''Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted. Parameters ---------- data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]] any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). ''' if self._tokenizer is None: raise ValueError("`tokenizer` must be set when using `tomotopy.utils.Corpus.process`") num = [0] raw_list = [] metadata_list = [] if show_progress: from tqdm import tqdm data_feeder_iter = iter(tqdm(data_feeder, total=total)) else: data_feeder_iter = iter(data_feeder) def _generate(): for _, d in zip(range(self._batch_size), data_feeder_iter): num[0] += 1 if isinstance(d, tuple) and len(d) == 2: raw_list.append(d[0]) metadata_list.append({}) yield d elif isinstance(d, tuple) and len(d) == 3: raw_list.append(d[0]) metadata_list.append(d[2]) yield d[:2] elif isinstance(d, str): raw_list.append(d) metadata_list.append({}) yield (d, None) else: raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]") while 1: added = super().add_docs(self._tokenizer(_generate()), iter(raw_list), iter(metadata_list)) if added == 0: break raw_list.clear() metadata_list.clear() return num[0] def save(self, filename:str, protocol=0): '''Save the current instance into the file `filename`. 
Parameters ---------- filename : str a path for the file where the instance is saved ''' import pickle with open(filename, 'wb') as f: pickle.dump(self, f) @staticmethod def load(filename:str): '''Load and return an instance from the file `filename` Parameters ---------- filename : str a path for the file to be loaded ''' import pickle with open(filename, 'rb') as f: obj = pickle.load(f) obj._stopwords = None return obj def __len__(self): return super().__len__() def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=float('-inf'), normalized=False, workers=0): '''..versionadded:: 0.10.0 Extract frequent n-grams using PMI score Parameters ---------- min_cf : int Minimum collection frequency of n-grams to be extracted min_df : int Minimum document frequency of n-grams to be extracted max_len : int Maximum length of n-grams to be extracted max_cand : int Maximum number of n-grams to be extracted min_score : float Minium PMI score of n-grams to be extracted normalized : bool whether to use Normalized PMI or just PMI workers : int an integer indicating the number of workers to perform samplings. If `workers` is 0, the number of cores in the system will be used. Returns ------- candidates : List[tomotopy.label.Candidate] The extracted n-gram candidates in `tomotopy.label.Candidate` type ''' return super().extract_ngrams(min_cf, min_df, max_len, max_cand, min_score, normalized, workers) def concat_ngrams(self, cands, delimiter='_'): '''..versionadded:: 0.10.0 Concatenate n-gram matched given candidates in the corpus into single word Parameters ---------- cands : Iterable[tomotopy.label.Candidate] n-gram candidates to be concatenated. It can be generated by `tomotopy.utils.Corpus.extract_ngrams`. delimiter : str Delimiter to be used for concatenating words. Default value is `'_'`. ''' return super().concat_ngrams(cands, delimiter)
Ancestors
- tomotopy._UtilsCorpus
Static methods
def load(filename: str)
-
Load and return an instance from the file `filename`.
Parameters
filename
:str
- a path for the file to be loaded
Expand source code
@staticmethod def load(filename:str): '''Load and return an instance from the file `filename` Parameters ---------- filename : str a path for the file to be loaded ''' import pickle with open(filename, 'rb') as f: obj = pickle.load(f) obj._stopwords = None return obj
Methods
def add_doc(self, words=None, raw=None, user_data=None, **kargs)
-
Add a new document into the corpus and return an index of the inserted document. This method requires either the `words` parameter or the `raw` and `user_data` parameters. If the `words` parameter is provided, `words` are expected to be already preprocessed results. If the `raw` parameter is provided, `raw` is expected to be a raw string of a document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.
If you need additional parameters for a specific topic model, such as `metadata` for `DMRModel` or `y` for `SLDAModel`, you can pass it as an arbitrary keyword argument.
Parameters
words
:Iterable[str]
- a list of words that are already preprocessed
raw
:str
- a raw string of a document which isn't preprocessed yet. The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
user_data
:Any
- user data for `tokenizer`. The `raw` and `user_data` parameters are sent to `tokenizer`.
**kargs
- arbitrary keyword arguments for specific topic models
Expand source code
def add_doc(self, words=None, raw=None, user_data=None, **kargs): '''Add a new document into the corpus and return an index of the inserted document. This method requires either `words` parameter or `raw` and `user_data` parameters. If `words` parameter is provided, `words` are expected to be already preprocessed results. If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document. If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.DMRModel` or `y` for `tomotopy.SLDAModel`, you can pass it as an arbitrary keyword argument. Parameters ---------- words : Iterable[str] a list of words that are already preprocessed raw : str a raw string of document which isn't preprocessed yet. The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set. user_data : Any an user data for `tokenizer`. The `raw` and `user_data` parameter are sent to `tokenizer`. **kargs arbitrary keyword arguments for specific topic models ''' return super().add_doc(words, raw, user_data, **kargs)
def concat_ngrams(self, cands, delimiter='_')
-
Added in version: 0.10.0
Concatenate n-gram matched given candidates in the corpus into single word
Parameters
cands
:Iterable[Candidate]
- n-gram candidates to be concatenated. It can be generated by `Corpus.extract_ngrams()`.
delimiter
:str
- Delimiter to be used for concatenating words. Default value is `'_'`.
Expand source code
def concat_ngrams(self, cands, delimiter='_'): '''..versionadded:: 0.10.0 Concatenate n-gram matched given candidates in the corpus into single word Parameters ---------- cands : Iterable[tomotopy.label.Candidate] n-gram candidates to be concatenated. It can be generated by `tomotopy.utils.Corpus.extract_ngrams`. delimiter : str Delimiter to be used for concatenating words. Default value is `'_'`. ''' return super().concat_ngrams(cands, delimiter)
def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=-inf, normalized=False, workers=0)
-
Added in version: 0.10.0
Extract frequent n-grams using PMI score
Parameters
min_cf
:int
- Minimum collection frequency of n-grams to be extracted
min_df
:int
- Minimum document frequency of n-grams to be extracted
max_len
:int
- Maximum length of n-grams to be extracted
max_cand
:int
- Maximum number of n-grams to be extracted
min_score
:float
- Minimum PMI score of n-grams to be extracted
normalized
:bool
- whether to use Normalized PMI or just PMI
workers
:int
- an integer indicating the number of workers to perform samplings. If `workers` is 0, the number of cores in the system will be used.
Returns
candidates
:List[Candidate]
- The extracted n-gram candidates in `tomotopy.label.Candidate` type
Expand source code
def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=float('-inf'), normalized=False, workers=0): '''..versionadded:: 0.10.0 Extract frequent n-grams using PMI score Parameters ---------- min_cf : int Minimum collection frequency of n-grams to be extracted min_df : int Minimum document frequency of n-grams to be extracted max_len : int Maximum length of n-grams to be extracted max_cand : int Maximum number of n-grams to be extracted min_score : float Minium PMI score of n-grams to be extracted normalized : bool whether to use Normalized PMI or just PMI workers : int an integer indicating the number of workers to perform samplings. If `workers` is 0, the number of cores in the system will be used. Returns ------- candidates : List[tomotopy.label.Candidate] The extracted n-gram candidates in `tomotopy.label.Candidate` type ''' return super().extract_ngrams(min_cf, min_df, max_len, max_cand, min_score, normalized, workers)
def process(self, data_feeder, show_progress=False, total=None)
-
Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted.
Parameters
data_feeder
:Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
- any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`).
Expand source code
def process(self, data_feeder, show_progress=False, total=None): '''Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted. Parameters ---------- data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]] any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). ''' if self._tokenizer is None: raise ValueError("`tokenizer` must be set when using `tomotopy.utils.Corpus.process`") num = [0] raw_list = [] metadata_list = [] if show_progress: from tqdm import tqdm data_feeder_iter = iter(tqdm(data_feeder, total=total)) else: data_feeder_iter = iter(data_feeder) def _generate(): for _, d in zip(range(self._batch_size), data_feeder_iter): num[0] += 1 if isinstance(d, tuple) and len(d) == 2: raw_list.append(d[0]) metadata_list.append({}) yield d elif isinstance(d, tuple) and len(d) == 3: raw_list.append(d[0]) metadata_list.append(d[2]) yield d[:2] elif isinstance(d, str): raw_list.append(d) metadata_list.append({}) yield (d, None) else: raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]") while 1: added = super().add_docs(self._tokenizer(_generate()), iter(raw_list), iter(metadata_list)) if added == 0: break raw_list.clear() metadata_list.clear() return num[0]
def save(self, filename: str, protocol=0)
-
Save the current instance into the file `filename`.
Parameters
filename
:str
- a path for the file where the instance is saved
Expand source code
def save(self, filename:str, protocol=0): '''Save the current instance into the file `filename`. Parameters ---------- filename : str a path for the file where the instance is saved ''' import pickle with open(filename, 'wb') as f: pickle.dump(self, f)
class SimpleTokenizer (stemmer=None, pattern: str = None, lowercase=True, ngram_list: Optional[List[str]] = None, ngram_delimiter: str = '_')
-
`SimpleTokenizer` provides a simple word-tokenizing utility with an arbitrary stemmer.
Parameters
stemmer
:Callable[str, str]
- a callable object for stemming words. If this value is set to `None`, words are not stemmed.
pattern
:str
- a regex pattern for extracting tokens
lowercase
:bool
- converts the token into lowercase if this is True
Here is an example of using SimpleTokenizer with NLTK for stemming.
::
import tomotopy as tp # This code requires nltk package for stemming. from nltk.stem.porter import PorterStemmer from nltk.corpus import stopwords stemmer = PorterStemmer() stopwords = set(stopwords.words('english')) corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem), stopwords=lambda x: len(x) <= 2 or x in stopwords) # data_feeder yields a tuple of (raw string, user data) or a str (raw string) corpus.process(open(input_file, encoding='utf-8')) # make LDA model and train mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus) mdl.train(0) print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) print('Removed top words:', mdl.removed_top_words) for i in range(0, 1000, 10): mdl.train(10) print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word)) # extract candidates for auto topic labeling extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000) cands = extractor.extract(mdl) labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25) for k in range(mdl.k): print("== Topic #{} ==".format(k)) print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5))) for word, prob in mdl.get_topic_words(k, top_n=10): print(word, prob, sep='\t') print() # Example of Results # ----------------- # == Topic #13 == # Labels: weapon systems, weaponry, anti-aircraft, towed, long-range # aircraft 0.020458335056900978 # use 0.019993379712104797 # airlin 0.012523100711405277 # car 0.012058146297931671 # vehicl 0.01165518444031477 # carrier 0.011531196534633636 # tank 0.011221226304769516 # design 0.010694277472794056 # audi 0.010322313755750656 # martin 0.009981346316635609 # # == Topic #17 == # Labels: American baseball player, American baseball, American actress, singer-songwriter and guitarist, American actor, director, producer, and screenwriter # american 0.04471408948302269 # english 0.01746685802936554 # player 
0.01714528724551201 # politician 0.014698212035000324 # footbal 0.012313882820308208 # author 0.010909952223300934 # actor 0.008949155919253826 # french 0.007647186517715454 # academ 0.0073020863346755505 # produc 0.006815808825194836 #
Expand source code
class SimpleTokenizer: '''`SimpleTokenizer` provided a simple word-tokenizing utility with an arbitrary stemmer.''' def __init__(self, stemmer = None, pattern:str = None, lowercase = True, ngram_list:Optional[List[str]] = None, ngram_delimiter:str = '_', ): '''Parameters ---------- stemmer : Callable[str, str] a callable object for stemming words. If this value is set to `None`, words are not stemmed. pattern : str a regex pattern for extracting tokens lowercase : bool converts the token into lowercase if this is True Here is an example of using SimpleTokenizer with NLTK for stemming. .. include:: ./auto_labeling_code_with_porter.rst ''' self._pat = re.compile(pattern or r"""[^\s.,;:'"?!<>(){}\[\]\\/`~@#$%^&*|]+""") if stemmer and not callable(stemmer): raise ValueError("`stemmer` must be callable.") self._stemmer = stemmer or None self._lowercase = lowercase self._ngram_pat = None self._ngram_delimiter = ngram_delimiter if ngram_list: self.build_ngram_pat(ngram_list) def build_ngram_pat(self, ngram_list:List[str]): ngram_vocab = {} patterns = [] for ngram in ngram_list: if self._lowercase: ngram = ngram.lower() words = self._pat.findall(ngram) if len(words) < 2: continue chrs = [] for word in words: if self._stemmer is not None: word = self._stemmer(word) try: wid = ngram_vocab[word] except KeyError: wid = chr(len(ngram_vocab) + 256) ngram_vocab[word] = wid chrs.append(wid) patterns.append(''.join(chrs)) if patterns: self._ngram_pat = re.compile('|'.join(sorted(patterns, key=lambda x: len(x), reverse=True))) self._ngram_vocab = ngram_vocab def _tokenize(self, raw:str): if self._ngram_pat is None: for g in self._pat.finditer(raw): start, end = g.span() word = g.group() if self._lowercase: word = word.lower() if self._stemmer is not None: word = self._stemmer(word) yield word, start, end - start else: all_words = [] all_spans = [] chrs = [] for g in self._pat.finditer(raw): all_spans.append(g.span()) word = g.group() if self._lowercase: word = word.lower() if 
self._stemmer is not None: word = self._stemmer(word) all_words.append(word) try: chrs.append(self._ngram_vocab[word]) except KeyError: chrs.append(' ') chrs = ''.join(chrs) for g in self._ngram_pat.finditer(chrs): s, e = g.span() is_space = all(raw[ns:ne].isspace() for (_, ns), (ne, _) in zip(all_spans[s:e-1], all_spans[s+1:e])) if not is_space: continue all_words[s] = self._ngram_delimiter.join(all_words[s:e]) all_words[s+1:e] = [None] * (e - s - 1) all_spans[s] = (all_spans[s][0], all_spans[e-1][1]) for (s, e), word in zip(all_spans, all_words): if word is None: continue yield word, s, e - s def __call__(self, raw:str, user_data=None): is_iterable = False # test raw is iterable if user_data is None and not isinstance(raw, str): try: iter(raw) is_iterable = True except TypeError: pass if is_iterable: for r, _ in raw: yield list(self._tokenize(r)) else: yield from self._tokenize(raw)
Methods
def build_ngram_pat(self, ngram_list: List[str])
-
Expand source code
def build_ngram_pat(self, ngram_list:List[str]): ngram_vocab = {} patterns = [] for ngram in ngram_list: if self._lowercase: ngram = ngram.lower() words = self._pat.findall(ngram) if len(words) < 2: continue chrs = [] for word in words: if self._stemmer is not None: word = self._stemmer(word) try: wid = ngram_vocab[word] except KeyError: wid = chr(len(ngram_vocab) + 256) ngram_vocab[word] = wid chrs.append(wid) patterns.append(''.join(chrs)) if patterns: self._ngram_pat = re.compile('|'.join(sorted(patterns, key=lambda x: len(x), reverse=True))) self._ngram_vocab = ngram_vocab