Module `tomotopy.utils`

Expand source code

from typing import Optional, List

import re

'''
Submodule `tomotopy.utils` provides various utilities for topic modeling. 
`tomotopy.utils.Corpus` class helps manage multiple documents easily. 
The documents inserted into `Corpus` can be used any topic models, and you can save the corpus preprocessed into a file and load the corpus from a file.
'''

from _tomotopy import (_UtilsCorpus, _UtilsVocabDict)

class Corpus(_UtilsCorpus):
    '''`Corpus` class is a utility that makes it easy to manage large amounts of documents.
    An instance of `Corpus` can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes.
    '''
    class _VocabDict(_UtilsVocabDict):
        pass
    def __init__(self, tokenizer=None, batch_size=64, stopwords=None):
        '''Parameters
----------
tokenizer : Union[Callable[[str, Any], List[Union[str, Tuple[str, int, int]]]], Callable[[Iterable[Tuple[str, Any]]], Iterable[List[Union[str, Tuple[str, int, int]]]]]]
    a callable object for tokenizing raw documents. If `tokenizer` is provided, you can use `tomotopy.utils.Corpus.add_doc` method with `raw` and `user_data` parameters.
    `tokenizer` receives two arguments `raw` and `user_data` and 
    it should return an iterable of `str`(the tokenized word) or of Tuple[`str`, `int`, `int`] (the tokenized word, starting position of the word, the length of the word).
batch_size : int
    `tomotopy.utils.Corpus.process` method reads a bunch of documents and send them to `tomotopy.utils.Corpus.add_doc`. `batch_size` indicates the size of the bunch.
stopwords : Union[Iterable[str], Callable[str, bool]]
    When calling `tomotopy.utils.Corpus.add_doc`, words in `stopwords` are not added to the document but are excluded.
    If `stopwords` is callable, a word is excluded from the document when `stopwords(word) == True`.
        '''
        super().__init__()
        self._tokenizer = tokenizer
        self._batch_size = batch_size
        if callable(stopwords):
            self._stopwords = stopwords
        elif stopwords is None:
            self._stopwords = None
        else:
            self._stopwords = lambda x: x in set(stopwords)

    def _select_args_for_model(self, model_type:type, args:dict):
        import tomotopy as tp
        if model_type in (tp.DMRModel, tp.GDMRModel):
            return {k:v for k, v in args.items() if k in ('metadata')}
        if model_type in (tp.LLDAModel, tp.PLDAModel):
            return {k:v for k, v in args.items() if k in ('labels')}
        if model_type is tp.MGLDAModel:
            return {k:v for k, v in args.items() if k in ('delimiter')}
        if model_type is tp.SLDAModel:
            return {k:v for k, v in args.items() if k in ('y')}
        if model_type is tp.DTModel:
            return {k:v for k, v in args.items() if k in ('timepoint')}
        return {}    

    def add_doc(self, words=None, raw=None, user_data=None, **kargs):
        '''Add a new document into the corpus and return an index of the inserted document. 
This method requires either `words` parameter or `raw` and `user_data` parameters. 
If `words` parameter is provided, `words` are expected to be already preprocessed results.
If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.DMRModel` or `y` for `tomotopy.SLDAModel`, you can pass it as an arbitrary keyword argument.

Parameters
----------
words : Iterable[str]
    a list of words that are already preprocessed
raw : str
    a raw string of document which isn't preprocessed yet. 
    The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
user_data : Any
    an user data for `tokenizer`. The `raw` and `user_data` parameter are sent to `tokenizer`.
**kargs
    arbitrary keyword arguments for specific topic models
        '''
        return super().add_doc(words, raw, user_data, **kargs)

    def process(self, data_feeder, show_progress=False, total=None):
        '''Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). 
        '''
        if self._tokenizer is None:
            raise ValueError("`tokenizer` must be set when using `tomotopy.utils.Corpus.process`")
        
        num = [0]
        raw_list = []
        metadata_list = []
        if show_progress:
            from tqdm import tqdm
            data_feeder_iter = iter(tqdm(data_feeder, total=total))
        else:
            data_feeder_iter = iter(data_feeder)
        def _generate():
            for _, d in zip(range(self._batch_size), data_feeder_iter):
                num[0] += 1
                if isinstance(d, tuple) and len(d) == 2:
                    raw_list.append(d[0])
                    metadata_list.append({})
                    yield d
                elif isinstance(d, tuple) and len(d) == 3:
                    raw_list.append(d[0])
                    metadata_list.append(d[2])
                    yield d[:2]
                elif isinstance(d, str):
                    raw_list.append(d)
                    metadata_list.append({})
                    yield (d, None)
                else:
                    raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]")    
        
        while 1:
            added = super().add_docs(self._tokenizer(_generate()), iter(raw_list), iter(metadata_list))
            if added == 0: break
            raw_list.clear()
            metadata_list.clear()
        return num[0]

    def save(self, filename:str, protocol=0):
        '''Save the current instance into the file `filename`. 

Parameters
----------
filename : str
    a path for the file where the instance is saved
        '''
        import pickle
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename:str):
        '''Load and return an instance from the file `filename`

Parameters
----------
filename : str
    a path for the file to be loaded
        '''
        import pickle
        with open(filename, 'rb') as f:
            obj = pickle.load(f)
        obj._stopwords = None
        return obj

    def __len__(self):
        return super().__len__()
    
    def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=float('-inf'), normalized=False, workers=0):
        '''..versionadded:: 0.10.0

Extract frequent n-grams using PMI score

Parameters
----------
min_cf : int
    Minimum collection frequency of n-grams to be extracted
min_df : int
    Minimum document frequency of n-grams to be extracted
max_len : int
    Maximum length of n-grams to be extracted
max_cand : int
    Maximum number of n-grams to be extracted
min_score : float
    Minium PMI score of n-grams to be extracted
normalized : bool
    whether to use Normalized PMI or just PMI
workers : int
    an integer indicating the number of workers to perform samplings. 
    If `workers` is 0, the number of cores in the system will be used.

Returns
-------
candidates : List[tomotopy.label.Candidate]
    The extracted n-gram candidates in `tomotopy.label.Candidate` type
        '''
        return super().extract_ngrams(min_cf, min_df, max_len, max_cand, min_score, normalized, workers)
    
    def concat_ngrams(self, cands, delimiter='_'):
        '''..versionadded:: 0.10.0

Concatenate n-gram matched given candidates in the corpus into single word

Parameters
----------
cands : Iterable[tomotopy.label.Candidate]
    n-gram candidates to be concatenated. It can be generated by `tomotopy.utils.Corpus.extract_ngrams`.
delimiter : str
    Delimiter to be used for concatenating words. Default value is `'_'`.
        '''
        return super().concat_ngrams(cands, delimiter)

class SimpleTokenizer:
    '''`SimpleTokenizer` provided a simple word-tokenizing utility with an arbitrary stemmer.'''
    def __init__(self, 
                 stemmer = None, 
                 pattern:str = None, 
                 lowercase = True, 
                 ngram_list:Optional[List[str]] = None,
                 ngram_delimiter:str = '_',
                 ):
        '''Parameters
----------
stemmer : Callable[str, str]
    a callable object for stemming words. If this value is set to `None`, words are not stemmed.
pattern : str
    a regex pattern for extracting tokens
lowercase : bool
    converts the token into lowercase if this is True

Here is an example of using SimpleTokenizer with NLTK for stemming.

.. include:: ./auto_labeling_code_with_porter.rst
'''
        self._pat = re.compile(pattern or r"""[^\s.,;:'"?!<>(){}\[\]\\/`~@#$%^&*|]+""")
        if stemmer and not callable(stemmer):
            raise ValueError("`stemmer` must be callable.")
        self._stemmer = stemmer or None
        self._lowercase = lowercase
        self._ngram_pat = None
        self._ngram_delimiter = ngram_delimiter
        if ngram_list:
            self.build_ngram_pat(ngram_list)

    def build_ngram_pat(self, ngram_list:List[str]):
        ngram_vocab = {}
        patterns = []

        for ngram in ngram_list:
            if self._lowercase:
                ngram = ngram.lower()
            words = self._pat.findall(ngram)
            if len(words) < 2:
                continue
            chrs = []
            for word in words:
                if self._stemmer is not None:
                    word = self._stemmer(word)
                try:
                    wid = ngram_vocab[word]
                except KeyError:
                    wid = chr(len(ngram_vocab) + 256)
                    ngram_vocab[word] = wid
                chrs.append(wid)
            patterns.append(''.join(chrs))
        
        if patterns:
            self._ngram_pat = re.compile('|'.join(sorted(patterns, key=lambda x: len(x), reverse=True)))
            self._ngram_vocab = ngram_vocab

    def _tokenize(self, raw:str):
        if self._ngram_pat is None:
            for g in self._pat.finditer(raw):
                start, end = g.span()
                word = g.group()
                if self._lowercase: 
                    word = word.lower()
                if self._stemmer is not None:
                    word = self._stemmer(word)
                yield word, start, end - start
        else:
            all_words = []
            all_spans = []
            chrs = []
            for g in self._pat.finditer(raw):
                all_spans.append(g.span())
                word = g.group()
                if self._lowercase: 
                    word = word.lower()
                if self._stemmer is not None:
                    word = self._stemmer(word)
                all_words.append(word)
                try:
                    chrs.append(self._ngram_vocab[word])
                except KeyError:
                    chrs.append(' ')
            chrs = ''.join(chrs)
            for g in self._ngram_pat.finditer(chrs):
                s, e = g.span()
                is_space = all(raw[ns:ne].isspace() for (_, ns), (ne, _) in zip(all_spans[s:e-1], all_spans[s+1:e]))
                if not is_space:
                    continue
                all_words[s] = self._ngram_delimiter.join(all_words[s:e])
                all_words[s+1:e] = [None] * (e - s - 1)
                all_spans[s] = (all_spans[s][0], all_spans[e-1][1])

            for (s, e), word in zip(all_spans, all_words):
                if word is None: continue
                yield word, s, e - s

    def __call__(self, raw:str, user_data=None):
        is_iterable = False
        # test raw is iterable
        if user_data is None and not isinstance(raw, str):
            try:
                iter(raw)
                is_iterable = True
            except TypeError:
                pass
        if is_iterable:
            for r, _ in raw:
                yield list(self._tokenize(r))
        else:
            yield from self._tokenize(raw)

import os
if os.environ.get('TOMOTOPY_LANG') == 'kr':
    __doc__ = """`tomotopy.utils` 서브모듈은 토픽 모델링에 유용한 여러 유틸리티를 제공합니다. 
`tomotopy.utils.Corpus` 클래스는 대량의 문헌을 관리할 수 있게 돕습니다. `Corpus`에 입력된 문헌들은 다양한 토픽 모델에 바로 입력될 수 있습니다.
또한 코퍼스 전처리 결과를 파일에 저장함으로써 필요에 따라 다시 코퍼스를 파일에서 읽어들여 원하는 토픽 모델에 입력할 수 있습니다.
    """
    __pdoc__ = {}
    __pdoc__['Corpus'] = """`Corpus`는 대량의 문헌을 간편하게 다룰 수 있게 도와주는 유틸리티 클래스입니다.
    `Corpus` 클래스의 인스턴스는 여러 개의 문헌을 포함할 수 있으며, 토픽 모델 클래스에 파라미터로 직접 넘겨질 수 있습니다.

Parameters
----------
tokenizer : Callable[[str, Any], Iterable[Union[str, Tuple[str, int, int]]]]
    비정제 문헌을 처리하는 데에 사용되는 호출 가능한 객체. `tokenizer`가 None이 아닌 값으로 주어진 경우, `tomotopy.utils.Corpus.add_doc` 메소드를 호출할 때 `raw` 및 `user_data` 파라미터를 사용할 수 있습니다.
    `tokenizer`는 인수로 `raw`와 `user_data` 2개를 받으며, 처리 결과로 `str`(정제된 단어) 혹은 Tuple[`str`, `int`, `int`] (정제된 단어, 단어 시작 위치, 단어 길이)의 iterable을 반환해야 합니다.
batch_size : int
    `tomotopy.utils.Corpus.process` 메소드는 대량의 문헌을 읽어들인 후 `tomotopy.utils.Corpus.add_doc`으로 넘깁니다. 이 때 한번에 읽어들이는 문헌의 개수를 `batch_size`로 지정할 수 있습니다.
stopwords : Iterable[str]
    `tomotopy.utils.Corpus.add_doc`가 호출될 때, `stopwords`에 포함된 단어들은 처리 단계에서 등록되지 않고 제외됩니다.
    `stopwords`가 호출가능한 경우, `stopwords(word) == True`이면 word는 불용어 처리되어 제외됩니다."""

    __pdoc__['Corpus.add_doc'] = """새 문헌을 코퍼스에 추가하고 추가된 문헌의 인덱스 번호를 반환합니다.
이 메소드는 `words` 파라미터나 `raw`, `user_data` 파라미터 둘 중 하나를 요구합니다.
`words` 파라미터를 사용할 경우, `words`는 이미 전처리된 단어들의 리스트여야 합니다.
`raw` 파라미터를 사용할 경우, `raw`는 정제되기 전 문헌의 str이며, `tokenizer`가 이 비정제문헌을 처리하기 위해 호출됩니다.

만약 `tomotopy.DMRModel`의 `metadata`나 `tomotopy.SLDAModel`의 `y`처럼 특정한 토픽 모델에 필요한 추가 파라미터가 있다면 임의 키워드 인자로 넘겨줄 수 있습니다.

Parameters
----------
words : Iterable[str]
    이미 전처리된 단어들의 리스트
raw : str
    전처리되기 이전의 문헌.
    이 파라미터를 사용하려면 인스턴스 생성시 `tokenizer` 파라미터를 넣어줘야 합니다.
user_data : Any
    `tokenizer`에 넘어가는 유저 데이터.  `raw`와 `user_data` 파라미터가 함께 `tokenizer`로 넘어갑니다.
**kargs
    추가적인 파라미터를 위한 임의 키워드 인자"""
    __pdoc__['Corpus.process'] = """이터레이터 `data_feeder`를 통해 다수의 문헌을 코퍼스에 추가하고, 추가된 문헌의 개수를 반환합니다.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    문자열 `raw`이나, 튜플 (`raw`, `user_data`), 혹은 튜플 (`raw`, `user_data`, `kargs`) 를 반환하는 이터레이터. """
    __pdoc__['Corpus.save'] = """현재 인스턴스를 파일 `filename`에 저장합니다.. 

Parameters
----------
filename : str
    인스턴스가 저장될 파일의 경로"""
    __pdoc__['Corpus.load'] = """파일 `filename`로부터 인스턴스를 읽어들여 반환합니다.

Parameters
----------
filename : str
    읽어들일 파일의 경로"""
    __pdoc__['Corpus.extract_ngrams'] = '''..versionadded:: 0.10.0

PMI 점수를 이용해 자주 등장하는 n-gram들을 추출합니다.

Parameters
----------
min_cf : int
    추출할 n-gram의 최소 장서빈도
min_df : int
    추출할 n-gram의 최소 문헌빈도
max_len : int
    추출할 n-gram의 최대 길이
max_cand : int
    추출할 n-gram의 갯수
min_score : float
    추출할 n-gram의 최소 PMI 점수

Returns
-------
candidates : List[tomotopy.label.Candidate]
    추출된 n-gram 후보의 리스트. `tomotopy.label.Candidate` 타입
'''
    __pdoc__['Corpus.concat_ngrams'] = '''..versionadded:: 0.10.0

코퍼스 내에서 주어진 n-gram 목록과 일치하는 단어열을 하나의 단어로 합칩니다.

Parameters
----------
cands : Iterable[tomotopy.label.Candidate]
    합칠 n-gram의 List. `tomotopy.utils.Corpus.extract_ngrams`로 생성할 수 있습니다.
delimiter : str
    여러 단어들을 연결할 때 사용할 구분자. 기본값은 `'_'`입니다.
'''

    __pdoc__['SimpleTokenizer'] = """`SimpleTokenizer`는 임의의 스테머를 사용할 수 있는 단순한 단어 분리 유틸리티입니다.

Parameters
----------
stemmer : Callable[str, str]
    단어를 스테밍하는데 사용되는 호출가능한 객체. 만약 이 값이 `None`이라면 스테밍은 사용되지 않습니다.
pattern : str
    토큰을 추출하는데 사용할 정규식 패턴
lowercase : bool
    참일 경우 분리된 단어들을 소문자화합니다.

SimpleTokenizer와 NLTK를 사용하여 스테밍을 하는 예제는 다음과 같습니다.

.. include:: ./auto_labeling_code_with_porter.rst"""

del os

Classes

class Corpus (tokenizer=None, batch_size=64, stopwords=None)

Corpus class is a utility that makes it easy to manage large amounts of documents. An instance of Corpus can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes.

Parameters

tokenizer : Union[Callable[[str, Any], List[Union[str, Tuple[str, int, int]]]], Callable[[Iterable[Tuple[str, Any]]], Iterable[List[Union[str, Tuple[str, int, int]]]]]]: a callable object for tokenizing raw documents. If tokenizer is provided, you can use Corpus.add_doc() method with raw and user_data parameters. tokenizer receives two arguments raw and user_data and it should return an iterable of str(the tokenized word) or of Tuple[str, int, int] (the tokenized word, starting position of the word, the length of the word).
batch_size : int: Corpus.process() method reads a bunch of documents and send them to Corpus.add_doc(). batch_size indicates the size of the bunch.
stopwords : Union[Iterable[str], Callable[str, bool]]: When calling Corpus.add_doc(), words in stopwords are not added to the document but are excluded. If stopwords is callable, a word is excluded from the document when stopwords(word) == True.

Expand source code

class Corpus(_UtilsCorpus):
    '''`Corpus` class is a utility that makes it easy to manage large amounts of documents.
    An instance of `Corpus` can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes.
    '''
    class _VocabDict(_UtilsVocabDict):
        pass
    def __init__(self, tokenizer=None, batch_size=64, stopwords=None):
        '''Parameters
----------
tokenizer : Union[Callable[[str, Any], List[Union[str, Tuple[str, int, int]]]], Callable[[Iterable[Tuple[str, Any]]], Iterable[List[Union[str, Tuple[str, int, int]]]]]]
    a callable object for tokenizing raw documents. If `tokenizer` is provided, you can use `tomotopy.utils.Corpus.add_doc` method with `raw` and `user_data` parameters.
    `tokenizer` receives two arguments `raw` and `user_data` and 
    it should return an iterable of `str`(the tokenized word) or of Tuple[`str`, `int`, `int`] (the tokenized word, starting position of the word, the length of the word).
batch_size : int
    `tomotopy.utils.Corpus.process` method reads a bunch of documents and send them to `tomotopy.utils.Corpus.add_doc`. `batch_size` indicates the size of the bunch.
stopwords : Union[Iterable[str], Callable[str, bool]]
    When calling `tomotopy.utils.Corpus.add_doc`, words in `stopwords` are not added to the document but are excluded.
    If `stopwords` is callable, a word is excluded from the document when `stopwords(word) == True`.
        '''
        super().__init__()
        self._tokenizer = tokenizer
        self._batch_size = batch_size
        if callable(stopwords):
            self._stopwords = stopwords
        elif stopwords is None:
            self._stopwords = None
        else:
            self._stopwords = lambda x: x in set(stopwords)

    def _select_args_for_model(self, model_type:type, args:dict):
        import tomotopy as tp
        if model_type in (tp.DMRModel, tp.GDMRModel):
            return {k:v for k, v in args.items() if k in ('metadata')}
        if model_type in (tp.LLDAModel, tp.PLDAModel):
            return {k:v for k, v in args.items() if k in ('labels')}
        if model_type is tp.MGLDAModel:
            return {k:v for k, v in args.items() if k in ('delimiter')}
        if model_type is tp.SLDAModel:
            return {k:v for k, v in args.items() if k in ('y')}
        if model_type is tp.DTModel:
            return {k:v for k, v in args.items() if k in ('timepoint')}
        return {}    

    def add_doc(self, words=None, raw=None, user_data=None, **kargs):
        '''Add a new document into the corpus and return an index of the inserted document. 
This method requires either `words` parameter or `raw` and `user_data` parameters. 
If `words` parameter is provided, `words` are expected to be already preprocessed results.
If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.DMRModel` or `y` for `tomotopy.SLDAModel`, you can pass it as an arbitrary keyword argument.

Parameters
----------
words : Iterable[str]
    a list of words that are already preprocessed
raw : str
    a raw string of document which isn't preprocessed yet. 
    The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
user_data : Any
    an user data for `tokenizer`. The `raw` and `user_data` parameter are sent to `tokenizer`.
**kargs
    arbitrary keyword arguments for specific topic models
        '''
        return super().add_doc(words, raw, user_data, **kargs)

    def process(self, data_feeder, show_progress=False, total=None):
        '''Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). 
        '''
        if self._tokenizer is None:
            raise ValueError("`tokenizer` must be set when using `tomotopy.utils.Corpus.process`")
        
        num = [0]
        raw_list = []
        metadata_list = []
        if show_progress:
            from tqdm import tqdm
            data_feeder_iter = iter(tqdm(data_feeder, total=total))
        else:
            data_feeder_iter = iter(data_feeder)
        def _generate():
            for _, d in zip(range(self._batch_size), data_feeder_iter):
                num[0] += 1
                if isinstance(d, tuple) and len(d) == 2:
                    raw_list.append(d[0])
                    metadata_list.append({})
                    yield d
                elif isinstance(d, tuple) and len(d) == 3:
                    raw_list.append(d[0])
                    metadata_list.append(d[2])
                    yield d[:2]
                elif isinstance(d, str):
                    raw_list.append(d)
                    metadata_list.append({})
                    yield (d, None)
                else:
                    raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]")    
        
        while 1:
            added = super().add_docs(self._tokenizer(_generate()), iter(raw_list), iter(metadata_list))
            if added == 0: break
            raw_list.clear()
            metadata_list.clear()
        return num[0]

    def save(self, filename:str, protocol=0):
        '''Save the current instance into the file `filename`. 

Parameters
----------
filename : str
    a path for the file where the instance is saved
        '''
        import pickle
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename:str):
        '''Load and return an instance from the file `filename`

Parameters
----------
filename : str
    a path for the file to be loaded
        '''
        import pickle
        with open(filename, 'rb') as f:
            obj = pickle.load(f)
        obj._stopwords = None
        return obj

    def __len__(self):
        return super().__len__()
    
    def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=float('-inf'), normalized=False, workers=0):
        '''..versionadded:: 0.10.0

Extract frequent n-grams using PMI score

Parameters
----------
min_cf : int
    Minimum collection frequency of n-grams to be extracted
min_df : int
    Minimum document frequency of n-grams to be extracted
max_len : int
    Maximum length of n-grams to be extracted
max_cand : int
    Maximum number of n-grams to be extracted
min_score : float
    Minium PMI score of n-grams to be extracted
normalized : bool
    whether to use Normalized PMI or just PMI
workers : int
    an integer indicating the number of workers to perform samplings. 
    If `workers` is 0, the number of cores in the system will be used.

Returns
-------
candidates : List[tomotopy.label.Candidate]
    The extracted n-gram candidates in `tomotopy.label.Candidate` type
        '''
        return super().extract_ngrams(min_cf, min_df, max_len, max_cand, min_score, normalized, workers)
    
    def concat_ngrams(self, cands, delimiter='_'):
        '''..versionadded:: 0.10.0

Concatenate n-gram matched given candidates in the corpus into single word

Parameters
----------
cands : Iterable[tomotopy.label.Candidate]
    n-gram candidates to be concatenated. It can be generated by `tomotopy.utils.Corpus.extract_ngrams`.
delimiter : str
    Delimiter to be used for concatenating words. Default value is `'_'`.
        '''
        return super().concat_ngrams(cands, delimiter)

Ancestors

tomotopy._UtilsCorpus

Static methods

def load(filename: str)

Load and return an instance from the file filename

Parameters

filename : str: a path for the file to be loaded

Expand source code

    @staticmethod
    def load(filename:str):
        '''Load and return an instance from the file `filename`

Parameters
----------
filename : str
    a path for the file to be loaded
        '''
        import pickle
        with open(filename, 'rb') as f:
            obj = pickle.load(f)
        obj._stopwords = None
        return obj

Methods

def add_doc(self, words=None, raw=None, user_data=None, **kargs)

Add a new document into the corpus and return an index of the inserted document. This method requires either words parameter or raw and user_data parameters. If words parameter is provided, words are expected to be already preprocessed results. If raw parameter is provided, raw is expected to be a raw string of document which isn't preprocessed yet, and tokenizer will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as metadata for DMRModel or y for SLDAModel, you can pass it as an arbitrary keyword argument.

Parameters

words : Iterable[str]: a list of words that are already preprocessed
raw : str: a raw string of document which isn't preprocessed yet. The raw parameter can be used only when the tokenizer parameter of __init__ is set.
user_data : Any: an user data for tokenizer. The raw and user_data parameter are sent to tokenizer.
**kargs: arbitrary keyword arguments for specific topic models

Expand source code

    def add_doc(self, words=None, raw=None, user_data=None, **kargs):
        '''Add a new document into the corpus and return an index of the inserted document. 
This method requires either `words` parameter or `raw` and `user_data` parameters. 
If `words` parameter is provided, `words` are expected to be already preprocessed results.
If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.DMRModel` or `y` for `tomotopy.SLDAModel`, you can pass it as an arbitrary keyword argument.

Parameters
----------
words : Iterable[str]
    a list of words that are already preprocessed
raw : str
    a raw string of document which isn't preprocessed yet. 
    The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
user_data : Any
    an user data for `tokenizer`. The `raw` and `user_data` parameter are sent to `tokenizer`.
**kargs
    arbitrary keyword arguments for specific topic models
        '''
        return super().add_doc(words, raw, user_data, **kargs)

def concat_ngrams(self, cands, delimiter='_')

Added in version: 0.10.0

Concatenate n-gram matched given candidates in the corpus into single word

Parameters

cands : Iterable[Candidate]: n-gram candidates to be concatenated. It can be generated by Corpus.extract_ngrams().
delimiter : str: Delimiter to be used for concatenating words. Default value is '_'.

Expand source code

    def concat_ngrams(self, cands, delimiter='_'):
        '''..versionadded:: 0.10.0

Concatenate n-gram matched given candidates in the corpus into single word

Parameters
----------
cands : Iterable[tomotopy.label.Candidate]
    n-gram candidates to be concatenated. It can be generated by `tomotopy.utils.Corpus.extract_ngrams`.
delimiter : str
    Delimiter to be used for concatenating words. Default value is `'_'`.
        '''
        return super().concat_ngrams(cands, delimiter)

def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=-inf, normalized=False, workers=0)

Added in version: 0.10.0

Extract frequent n-grams using PMI score

Parameters

min_cf : int: Minimum collection frequency of n-grams to be extracted
min_df : int: Minimum document frequency of n-grams to be extracted
max_len : int: Maximum length of n-grams to be extracted
max_cand : int: Maximum number of n-grams to be extracted
min_score : float: Minium PMI score of n-grams to be extracted
normalized : bool: whether to use Normalized PMI or just PMI
workers : int: an integer indicating the number of workers to perform samplings. If workers is 0, the number of cores in the system will be used.

Returns

candidates : List[Candidate]: The extracted n-gram candidates in Candidate type

Expand source code

    def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=float('-inf'), normalized=False, workers=0):
        '''..versionadded:: 0.10.0

Extract frequent n-grams using PMI score

Parameters
----------
min_cf : int
    Minimum collection frequency of n-grams to be extracted
min_df : int
    Minimum document frequency of n-grams to be extracted
max_len : int
    Maximum length of n-grams to be extracted
max_cand : int
    Maximum number of n-grams to be extracted
min_score : float
    Minium PMI score of n-grams to be extracted
normalized : bool
    whether to use Normalized PMI or just PMI
workers : int
    an integer indicating the number of workers to perform samplings. 
    If `workers` is 0, the number of cores in the system will be used.

Returns
-------
candidates : List[tomotopy.label.Candidate]
    The extracted n-gram candidates in `tomotopy.label.Candidate` type
        '''
        return super().extract_ngrams(min_cf, min_df, max_len, max_cand, min_score, normalized, workers)

def process(self, data_feeder, show_progress=False, total=None)

Add multiple documents into the corpus through a given iterator data_feeder and return the number of documents inserted.

Parameters

data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]: any iterable yielding a str raw, a tuple of (raw, user_data) or a tuple of (raw, user_data, arbitrary_keyword_args).

Expand source code

    def process(self, data_feeder, show_progress=False, total=None):
        '''Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). 
        '''
        if self._tokenizer is None:
            raise ValueError("`tokenizer` must be set when using `tomotopy.utils.Corpus.process`")
        
        num = [0]
        raw_list = []
        metadata_list = []
        if show_progress:
            from tqdm import tqdm
            data_feeder_iter = iter(tqdm(data_feeder, total=total))
        else:
            data_feeder_iter = iter(data_feeder)
        def _generate():
            for _, d in zip(range(self._batch_size), data_feeder_iter):
                num[0] += 1
                if isinstance(d, tuple) and len(d) == 2:
                    raw_list.append(d[0])
                    metadata_list.append({})
                    yield d
                elif isinstance(d, tuple) and len(d) == 3:
                    raw_list.append(d[0])
                    metadata_list.append(d[2])
                    yield d[:2]
                elif isinstance(d, str):
                    raw_list.append(d)
                    metadata_list.append({})
                    yield (d, None)
                else:
                    raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]")    
        
        while 1:
            added = super().add_docs(self._tokenizer(_generate()), iter(raw_list), iter(metadata_list))
            if added == 0: break
            raw_list.clear()
            metadata_list.clear()
        return num[0]

def save(self, filename: str, protocol=0)

Save the current instance into the file filename.

Parameters

filename : str: a path for the file where the instance is saved

Expand source code

    def save(self, filename:str, protocol=0):
        '''Save the current instance into the file `filename`. 

Parameters
----------
filename : str
    a path for the file where the instance is saved
        '''
        import pickle
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

class SimpleTokenizer (stemmer=None, pattern: str = None, lowercase=True, ngram_list: Optional[List[str]] = None, ngram_delimiter: str = '_')

SimpleTokenizer provided a simple word-tokenizing utility with an arbitrary stemmer.

Parameters

stemmer : Callable[str, str]: a callable object for stemming words. If this value is set to None, words are not stemmed.
pattern : str: a regex pattern for extracting tokens
lowercase : bool: converts the token into lowercase if this is True

Here is an example of using SimpleTokenizer with NLTK for stemming.

import tomotopy as tp

# This code requires nltk package for stemming.
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

stemmer = PorterStemmer()
stopwords = set(stopwords.words('english'))
corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem), 
    stopwords=lambda x: len(x) <= 2 or x in stopwords)
# data_feeder yields a tuple of (raw string, user data) or a str (raw string)
corpus.process(open(input_file, encoding='utf-8'))

# make LDA model and train
mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
mdl.train(0)
print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
print('Removed top words:', mdl.removed_top_words)
for i in range(0, 1000, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

# extract candidates for auto topic labeling
extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
cands = extractor.extract(mdl)

labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
for k in range(mdl.k):
    print("== Topic #{} ==".format(k))
    print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
    for word, prob in mdl.get_topic_words(k, top_n=10):
        print(word, prob, sep='\t')
    print()

# Example of Results
# -----------------
# == Topic #13 ==
# Labels: weapon systems, weaponry, anti-aircraft, towed, long-range
# aircraft        0.020458335056900978
# use     0.019993379712104797
# airlin  0.012523100711405277
# car     0.012058146297931671
# vehicl  0.01165518444031477
# carrier 0.011531196534633636
# tank    0.011221226304769516
# design  0.010694277472794056
# audi    0.010322313755750656
# martin  0.009981346316635609
# 
# == Topic #17 ==
# Labels: American baseball player, American baseball, American actress, singer-songwriter and guitarist, American actor, director, producer, and screenwriter
# american        0.04471408948302269
# english 0.01746685802936554
# player  0.01714528724551201
# politician      0.014698212035000324
# footbal 0.012313882820308208
# author  0.010909952223300934
# actor   0.008949155919253826
# french  0.007647186517715454
# academ  0.0073020863346755505
# produc  0.006815808825194836
#

Expand source code

class SimpleTokenizer:
    '''`SimpleTokenizer` provided a simple word-tokenizing utility with an arbitrary stemmer.'''
    def __init__(self, 
                 stemmer = None, 
                 pattern:str = None, 
                 lowercase = True, 
                 ngram_list:Optional[List[str]] = None,
                 ngram_delimiter:str = '_',
                 ):
        '''Parameters
----------
stemmer : Callable[str, str]
    a callable object for stemming words. If this value is set to `None`, words are not stemmed.
pattern : str
    a regex pattern for extracting tokens
lowercase : bool
    converts the token into lowercase if this is True

Here is an example of using SimpleTokenizer with NLTK for stemming.

.. include:: ./auto_labeling_code_with_porter.rst
'''
        self._pat = re.compile(pattern or r"""[^\s.,;:'"?!<>(){}\[\]\\/`~@#$%^&*|]+""")
        if stemmer and not callable(stemmer):
            raise ValueError("`stemmer` must be callable.")
        self._stemmer = stemmer or None
        self._lowercase = lowercase
        self._ngram_pat = None
        self._ngram_delimiter = ngram_delimiter
        if ngram_list:
            self.build_ngram_pat(ngram_list)

    def build_ngram_pat(self, ngram_list:List[str]):
        ngram_vocab = {}
        patterns = []

        for ngram in ngram_list:
            if self._lowercase:
                ngram = ngram.lower()
            words = self._pat.findall(ngram)
            if len(words) < 2:
                continue
            chrs = []
            for word in words:
                if self._stemmer is not None:
                    word = self._stemmer(word)
                try:
                    wid = ngram_vocab[word]
                except KeyError:
                    wid = chr(len(ngram_vocab) + 256)
                    ngram_vocab[word] = wid
                chrs.append(wid)
            patterns.append(''.join(chrs))
        
        if patterns:
            self._ngram_pat = re.compile('|'.join(sorted(patterns, key=lambda x: len(x), reverse=True)))
            self._ngram_vocab = ngram_vocab

    def _tokenize(self, raw:str):
        if self._ngram_pat is None:
            for g in self._pat.finditer(raw):
                start, end = g.span()
                word = g.group()
                if self._lowercase: 
                    word = word.lower()
                if self._stemmer is not None:
                    word = self._stemmer(word)
                yield word, start, end - start
        else:
            all_words = []
            all_spans = []
            chrs = []
            for g in self._pat.finditer(raw):
                all_spans.append(g.span())
                word = g.group()
                if self._lowercase: 
                    word = word.lower()
                if self._stemmer is not None:
                    word = self._stemmer(word)
                all_words.append(word)
                try:
                    chrs.append(self._ngram_vocab[word])
                except KeyError:
                    chrs.append(' ')
            chrs = ''.join(chrs)
            for g in self._ngram_pat.finditer(chrs):
                s, e = g.span()
                is_space = all(raw[ns:ne].isspace() for (_, ns), (ne, _) in zip(all_spans[s:e-1], all_spans[s+1:e]))
                if not is_space:
                    continue
                all_words[s] = self._ngram_delimiter.join(all_words[s:e])
                all_words[s+1:e] = [None] * (e - s - 1)
                all_spans[s] = (all_spans[s][0], all_spans[e-1][1])

            for (s, e), word in zip(all_spans, all_words):
                if word is None: continue
                yield word, s, e - s

    def __call__(self, raw:str, user_data=None):
        is_iterable = False
        # test raw is iterable
        if user_data is None and not isinstance(raw, str):
            try:
                iter(raw)
                is_iterable = True
            except TypeError:
                pass
        if is_iterable:
            for r, _ in raw:
                yield list(self._tokenize(r))
        else:
            yield from self._tokenize(raw)

Methods

def build_ngram_pat(self, ngram_list: List[str])

Expand source code

def build_ngram_pat(self, ngram_list:List[str]):
    ngram_vocab = {}
    patterns = []

    for ngram in ngram_list:
        if self._lowercase:
            ngram = ngram.lower()
        words = self._pat.findall(ngram)
        if len(words) < 2:
            continue
        chrs = []
        for word in words:
            if self._stemmer is not None:
                word = self._stemmer(word)
            try:
                wid = ngram_vocab[word]
            except KeyError:
                wid = chr(len(ngram_vocab) + 256)
                ngram_vocab[word] = wid
            chrs.append(wid)
        patterns.append(''.join(chrs))
    
    if patterns:
        self._ngram_pat = re.compile('|'.join(sorted(patterns, key=lambda x: len(x), reverse=True)))
        self._ngram_vocab = ngram_vocab