Module `tomotopy.utils`

The submodule tomotopy.utils provides various utilities for topic modeling. The Corpus class helps manage multiple documents easily. Documents inserted into a Corpus can be used with any topic model, and you can save the preprocessed corpus to a file and load it back from that file later.

Expand source code

'''
Submodule `tomotopy.utils` provides various utilities for topic modeling. 
`tomotopy.utils.Corpus` class helps manage multiple documents easily. 
The documents inserted into `Corpus` can be used any topic models, and you can save the corpus preprocessed into a file and load the corpus from a file.
'''

class Corpus:
    '''`Corpus` class is a utility that makes it easy to manage large amounts of documents.
    An instance of `Corpus` can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes.
    '''

    class _VocabDict:
        # Two-way table between words and integer ids; ids are assigned in first-seen order.
        def __init__(self):
            self.id2word = []
            self.word2id = {}
        
        def to_id(self, word):
            # Return the id of `word`, registering it under the next free id when it is new.
            r = self.word2id.get(word)
            if r is not None: return r
            r = len(self.word2id)
            self.word2id[word] = r
            self.id2word.append(word)
            return r

        def to_word(self, id_):
            return self.id2word[id_]

    class _StopwordSet:
        # Callable membership test over a fixed set of words.
        # A class is used instead of a lambda so that the corpus stays picklable
        # (`Corpus.save` pickles the whole instance, and lambdas cannot be pickled).
        # The set is built exactly once, so one-shot iterables work as well.
        def __init__(self, words=()):
            self._words = frozenset(words)

        def __call__(self, word):
            return word in self._words

    def __init__(self, tokenizer=None, batch_size=64, stopwords=None):
        '''Parameters
----------
tokenizer : Callable[[str, Any], Iterable[Union[str, Tuple[str, int, int]]]]
    a callable object for tokenizing raw documents. If `tokenizer` is provided, you can use `tomotopy.utils.Corpus.add_doc` method with `raw` and `user_data` parameters.
    `tokenizer` receives two arguments `raw` and `user_data` and 
    it should return an iterable of `str`(the tokenized word) or of Tuple[`str`, `int`, `int`] (the tokenized word, starting position of the word, the length of the word).
batch_size : int
    `tomotopy.utils.Corpus.process` method reads a bunch of documents and sends them to `tomotopy.utils.Corpus.add_doc`. `batch_size` indicates the size of the bunch.
stopwords : Union[Iterable[str], Callable[[str], bool]]
    When a raw document is tokenized by `tomotopy.utils.Corpus.add_doc`, words in `stopwords` are not added to the document but are excluded.
    If `stopwords` is callable, a word is excluded from the document when `stopwords(word) == True`.
    To be able to `save` the corpus, a callable `stopwords` (and the `tokenizer`) must be picklable.
        '''
        self._docs = []
        self._tokenizer = tokenizer
        self._batch_size = batch_size
        self._vocab = Corpus._VocabDict()
        if callable(stopwords):
            self._stopwords = stopwords
        else:
            # `None` means "no stopwords"; any other value is treated as an iterable of words.
            self._stopwords = Corpus._StopwordSet(() if stopwords is None else stopwords)

    def _select_args_for_model(self, model_type:type, args:dict):
        # Keep only the per-document keyword arguments that `model_type` understands.
        import tomotopy as tp
        # NOTE: these must be real tuples. `('metadata')` is just the string
        # 'metadata', which would turn `k in ...` into a substring test.
        if model_type is tp.DMRModel:
            allowed = ('metadata',)
        elif model_type in (tp.LLDAModel, tp.PLDAModel):
            allowed = ('labels',)
        elif model_type is tp.MGLDAModel:
            allowed = ('delimiter',)
        elif model_type is tp.SLDAModel:
            allowed = ('y',)
        else:
            allowed = ()
        return {k:v for k, v in args.items() if k in allowed}
    
    def _feed_docs_to(self, model, transform=None):
        # Push the vocabulary and every stored document into `model`.
        # `transform` (optional) maps each document's keyword-argument dict before filtering.
        if not self._docs: 
            raise ValueError("Cannot feed zero-size corpus.")
        
        model._update_vocab(self._vocab.id2word)
        transform = transform or (lambda x:x)
        model_type = type(model)

        if self._tokenizer:
            # Stored layout: (token ids, raw text, start positions, lengths, kwargs)
            for doc in self._docs:
                model._add_doc(doc[0], raw=doc[1], start_pos=doc[2], length=doc[3], **self._select_args_for_model(model_type, transform(doc[4])))
        else:
            # Stored layout: (token ids, kwargs)
            for doc in self._docs:
                model._add_doc(doc[0], **self._select_args_for_model(model_type, transform(doc[1])))

    def _tokenize(self, raw, user_data=None):
        # Run the tokenizer on `raw`, drop stopwords and convert the remaining words to ids.
        # Returns (token ids, start positions, lengths); the last two are only
        # filled for tokens the tokenizer yields as (word, start, length) tuples.
        tokens, ss, ls = [], [], []
        for t in self._tokenizer(raw, user_data=user_data):
            if isinstance(t, str):
                if self._stopwords(t): continue
                tokens.append(self._vocab.to_id(t))
            elif isinstance(t, tuple) and len(t) == 3:
                if self._stopwords(t[0]): continue
                tokens.append(self._vocab.to_id(t[0]))
                ss.append(t[1])
                ls.append(t[2])
            else:
                raise ValueError("`tokenizer` must return `str` or `tuple` of (`str`, `int`, `int`).")
        return tokens, ss, ls

    def add_doc(self, words=None, raw=None, user_data=None, **kargs):
        '''Add a new document into the corpus and return an index of the inserted document. 
If the given document is empty, nothing is inserted and -1 is returned.
This method requires either `words` parameter or `raw` and `user_data` parameters. 
If `words` parameter is provided, `words` are expected to be already preprocessed results.
If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.DMRModel` or `y` for `tomotopy.SLDAModel`, you can pass it as an arbitrary keyword argument.

Parameters
----------
words : Iterable[str]
    a list of words that are already preprocessed
raw : str
    a raw string of document which isn't preprocessed yet. 
    The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
user_data : Any
    user data for `tokenizer`. The `raw` and `user_data` parameters are sent to `tokenizer`.
**kargs
    arbitrary keyword arguments for specific topic models
        '''
        if self._tokenizer:
            if words is not None: 
                raise ValueError("`raw` is required when `tokenizer` or `batch_tokenizer` is provided.")
            if not isinstance(raw, str):
                raise ValueError("`raw` must be `str` type.")
            if not raw: return -1
            tokens, ss, ls = self._tokenize(raw, user_data=user_data)
            self._docs.append((tokens, raw, ss, ls, kargs))
        else:
            if raw is not None: 
                raise ValueError("`words` is required when neither `tokenizer` nor `batch_tokenizer` is provided.")
            if isinstance(words, str):
                raise ValueError("`words` must not be `str`, but `iterable` of `str` type.")
            if not words: return -1
            self._docs.append(([self._vocab.to_id(w) for w in words], kargs))
        return len(self._docs) - 1
    
    def process(self, data_feeder):
        '''Add multiple documents into the corpus through a given iterable `data_feeder` and return the number of items read from it.
Items whose `raw` string is empty are counted in the return value although `add_doc` does not insert them.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). 
        '''
        res = []
        num = 0
        for d in data_feeder:
            num += 1
            if isinstance(d, str):
                res.append((d, None, {}))
            elif isinstance(d, tuple) and len(d) == 2:
                res.append((*d, {}))
            elif isinstance(d, tuple) and len(d) == 3:
                res.append(d)
            else:
                raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]")

            # Documents are validated first and only added once a full batch is collected.
            if len(res) >= self._batch_size:
                for raw, user_data, kargs in res:
                    self.add_doc(raw=raw, user_data=user_data, **kargs)
                res.clear()
        
        # Flush the last, possibly partial, batch.
        for raw, user_data, kargs in res:
            self.add_doc(raw=raw, user_data=user_data, **kargs)
        
        return num

    def save(self, filename:str):
        '''Save the current instance into the file `filename`. 
The instance is serialized with `pickle`, so a custom `tokenizer` or callable `stopwords` must be picklable.

Parameters
----------
filename : str
    a path for the file where the instance is saved
        '''
        import pickle
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename:str):
        '''Load and return an instance from the file `filename`.
The file is read with `pickle`, which can execute arbitrary code while loading: only load files you trust.

Parameters
----------
filename : str
    a path for the file to be loaded
        '''
        import pickle
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def __len__(self):
        # Number of documents currently stored in the corpus.
        return len(self._docs)

class SimpleTokenizer:
    '''`SimpleTokenizer` provides a simple word-tokenizing utility with an arbitrary stemmer.'''
    def __init__(self, stemmer=None):
        '''Parameters
----------
stemmer : Callable[[str], str]
    a callable object for stemming words. If this value is set to `None`, words are not stemmed.

Here is an example of using SimpleTokenizer with NLTK for stemming.

.. include:: ./auto_labeling_code_with_porter.rst
'''
        import re
        # A token is a maximal run of characters that are neither whitespace nor one of the listed punctuation marks.
        self._pat = re.compile(r"""[^\s.,;:'"?!<>(){}\[\]\\/`~@#$%^&*|]+""")
        # A named function (not a lambda) keeps the tokenizer, and any corpus holding it, picklable.
        self._stemmer = stemmer or SimpleTokenizer._identity

    @staticmethod
    def _identity(word):
        # Default "stemmer": leave the word untouched.
        return word

    def __call__(self, raw:str, user_data=None):
        '''Yield a tuple (stemmed lower-cased word, start position, length) for each token of `raw`.
The start position and the length always refer to the original `raw` string. `user_data` is accepted for interface compatibility and is not used.
'''
        # Match on `raw` itself and lower-case each token afterwards: lower-casing
        # the whole string first can change its length (e.g. 'İ' becomes two
        # code points), which would shift every following offset.
        for g in self._pat.finditer(raw):
            start, end = g.span(0)
            yield self._stemmer(g.group(0).lower()), start, end - start

# Optional Korean documentation.
# When the environment variable TOMOTOPY_LANG equals 'kr', the module docstring is
# replaced and `__pdoc__` is filled with Korean texts for the public classes and methods.
# NOTE(review): `__pdoc__` appears to be the docstring-override mapping read by the
# pdoc documentation generator -- confirm against the documentation build.
import os
if os.environ.get('TOMOTOPY_LANG') == 'kr':
    # Korean replacement for the module-level docstring.
    __doc__ = """`tomotopy.utils` 서브모듈은 토픽 모델링에 유용한 여러 유틸리티를 제공합니다. 
`tomotopy.utils.Corpus` 클래스는 대량의 문헌을 관리할 수 있게 돕습니다. `Corpus`에 입력된 문헌들은 다양한 토픽 모델에 바로 입력될 수 있습니다.
또한 코퍼스 전처리 결과를 파일에 저장함으로써 필요에 따라 다시 코퍼스를 파일에서 읽어들여 원하는 토픽 모델에 입력할 수 있습니다.
    """
    # Keys are the documented names ('Corpus', 'Corpus.add_doc', ...); values are the Korean texts.
    __pdoc__ = {}
    __pdoc__['Corpus'] = """`Corpus`는 대량의 문헌을 간편하게 다룰 수 있게 도와주는 유틸리티 클래스입니다.
    `Corpus` 클래스의 인스턴스는 여러 개의 문헌을 포함할 수 있으며, 토픽 모델 클래스에 파라미터로 직접 넘겨질 수 있습니다.

Parameters
----------
tokenizer : Callable[[str, Any], Iterable[Union[str, Tuple[str, int, int]]]]
    비정제 문헌을 처리하는 데에 사용되는 호출 가능한 객체. `tokenizer`가 None이 아닌 값으로 주어진 경우, `tomotopy.utils.Corpus.add_doc` 메소드를 호출할 때 `raw` 및 `user_data` 파라미터를 사용할 수 있습니다.
    `tokenizer`는 인수로 `raw`와 `user_data` 2개를 받으며, 처리 결과로 `str`(정제된 단어) 혹은 Tuple[`str`, `int`, `int`] (정제된 단어, 단어 시작 위치, 단어 길이)의 iterable을 반환해야 합니다.
batch_size : int
    `tomotopy.utils.Corpus.process` 메소드는 대량의 문헌을 읽어들인 후 `tomotopy.utils.Corpus.add_doc`으로 넘깁니다. 이 때 한번에 읽어들이는 문헌의 개수를 `batch_size`로 지정할 수 있습니다.
stopwords : Iterable[str]
    `tomotopy.utils.Corpus.add_doc`가 호출될 때, `stopwords`에 포함된 단어들은 처리 단계에서 등록되지 않고 제외됩니다.
    `stopwords`가 호출가능한 경우, `stopwords(word) == True`이면 word는 불용어 처리되어 제외됩니다."""

    __pdoc__['Corpus.add_doc'] = """새 문헌을 코퍼스에 추가하고 추가된 문헌의 인덱스 번호를 반환합니다.
이 메소드는 `words` 파라미터나 `raw`, `user_data` 파라미터 둘 중 하나를 요구합니다.
`words` 파라미터를 사용할 경우, `words`는 이미 전처리된 단어들의 리스트여야 합니다.
`raw` 파라미터를 사용할 경우, `raw`는 정제되기 전 문헌의 str이며, `tokenizer`가 이 비정제문헌을 처리하기 위해 호출됩니다.

만약 `tomotopy.DMRModel`의 `metadata`나 `tomotopy.SLDAModel`의 `y`처럼 특정한 토픽 모델에 필요한 추가 파라미터가 있다면 임의 키워드 인자로 넘겨줄 수 있습니다.

Parameters
----------
words : Iterable[str]
    이미 전처리된 단어들의 리스트
raw : str
    전처리되기 이전의 문헌.
    이 파라미터를 사용하려면 인스턴스 생성시 `tokenizer` 파라미터를 넣어줘야 합니다.
user_data : Any
    `tokenizer`에 넘어가는 유저 데이터.  `raw`와 `user_data` 파라미터가 함께 `tokenizer`로 넘어갑니다.
**kargs
    추가적인 파라미터를 위한 임의 키워드 인자"""
    __pdoc__['Corpus.process'] = """이터레이터 `data_feeder`를 통해 다수의 문헌을 코퍼스에 추가하고, 추가된 문헌의 개수를 반환합니다.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    문자열 `raw`이나, 튜플 (`raw`, `user_data`), 혹은 튜플 (`raw`, `user_data`, `kargs`) 를 반환하는 이터레이터. """
    __pdoc__['Corpus.save'] = """현재 인스턴스를 파일 `filename`에 저장합니다.. 

Parameters
----------
filename : str
    인스턴스가 저장될 파일의 경로"""
    __pdoc__['Corpus.load'] = """파일 `filename`로부터 인스턴스를 읽어들여 반환합니다.

Parameters
----------
filename : str
    읽어들일 파일의 경로"""
    
    __pdoc__['SimpleTokenizer'] = """`SimpleTokenizer`는 임의의 스테머를 사용할 수 있는 단순한 단어 분리 유틸리티입니다.

Parameters
----------
stemmer : Callable[str, str]
    단어를 스테밍하는데 사용되는 호출가능한 객체. 만약 이 값이 `None`이라면 스테밍은 사용되지 않습니다.
    
SimpleTokenizer와 NLTK를 사용하여 스테밍을 하는 예제는 다음과 같습니다.

.. include:: ./auto_labeling_code_with_porter.rst"""

# `os` was only needed for the environment lookup above; remove it from the module namespace.
del os

Classes

class Corpus (tokenizer=None, batch_size=64, stopwords=None)

Corpus class is a utility that makes it easy to manage large amounts of documents. An instance of Corpus can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes.

Parameters

tokenizer : Callable[[str, Any], Iterable[Union[str, Tuple[str, int, int]]]]: a callable object for tokenizing raw documents. If tokenizer is provided, you can use Corpus.add_doc() method with raw and user_data parameters. tokenizer receives two arguments raw and user_data and it should return an iterable of str(the tokenized word) or of Tuple[str, int, int] (the tokenized word, starting position of the word, the length of the word).
batch_size : int: The Corpus.process() method reads documents in batches and sends them to Corpus.add_doc(). batch_size is the number of documents in each batch.
stopwords : Union[Iterable[str], Callable[str, bool]]: When calling Corpus.add_doc(), words in stopwords are not added to the document but are excluded. If stopwords is callable, a word is excluded from the document when stopwords(word) == True.

Expand source code

class Corpus:
    '''`Corpus` class is a utility that makes it easy to manage large amounts of documents.
    An instance of `Corpus` can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes.
    '''

    class _VocabDict:
        # Two-way table between words and integer ids; ids are assigned in first-seen order.
        def __init__(self):
            self.id2word = []
            self.word2id = {}
        
        def to_id(self, word):
            # Return the id of `word`, registering it under the next free id when it is new.
            r = self.word2id.get(word)
            if r is not None: return r
            r = len(self.word2id)
            self.word2id[word] = r
            self.id2word.append(word)
            return r

        def to_word(self, id_):
            return self.id2word[id_]

    class _StopwordSet:
        # Callable membership test over a fixed set of words.
        # A class is used instead of a lambda so that the corpus stays picklable
        # (`Corpus.save` pickles the whole instance, and lambdas cannot be pickled).
        # The set is built exactly once, so one-shot iterables work as well.
        def __init__(self, words=()):
            self._words = frozenset(words)

        def __call__(self, word):
            return word in self._words

    def __init__(self, tokenizer=None, batch_size=64, stopwords=None):
        '''Parameters
----------
tokenizer : Callable[[str, Any], Iterable[Union[str, Tuple[str, int, int]]]]
    a callable object for tokenizing raw documents. If `tokenizer` is provided, you can use `tomotopy.utils.Corpus.add_doc` method with `raw` and `user_data` parameters.
    `tokenizer` receives two arguments `raw` and `user_data` and 
    it should return an iterable of `str`(the tokenized word) or of Tuple[`str`, `int`, `int`] (the tokenized word, starting position of the word, the length of the word).
batch_size : int
    `tomotopy.utils.Corpus.process` method reads a bunch of documents and sends them to `tomotopy.utils.Corpus.add_doc`. `batch_size` indicates the size of the bunch.
stopwords : Union[Iterable[str], Callable[[str], bool]]
    When a raw document is tokenized by `tomotopy.utils.Corpus.add_doc`, words in `stopwords` are not added to the document but are excluded.
    If `stopwords` is callable, a word is excluded from the document when `stopwords(word) == True`.
    To be able to `save` the corpus, a callable `stopwords` (and the `tokenizer`) must be picklable.
        '''
        self._docs = []
        self._tokenizer = tokenizer
        self._batch_size = batch_size
        self._vocab = Corpus._VocabDict()
        if callable(stopwords):
            self._stopwords = stopwords
        else:
            # `None` means "no stopwords"; any other value is treated as an iterable of words.
            self._stopwords = Corpus._StopwordSet(() if stopwords is None else stopwords)

    def _select_args_for_model(self, model_type:type, args:dict):
        # Keep only the per-document keyword arguments that `model_type` understands.
        import tomotopy as tp
        # NOTE: these must be real tuples. `('metadata')` is just the string
        # 'metadata', which would turn `k in ...` into a substring test.
        if model_type is tp.DMRModel:
            allowed = ('metadata',)
        elif model_type in (tp.LLDAModel, tp.PLDAModel):
            allowed = ('labels',)
        elif model_type is tp.MGLDAModel:
            allowed = ('delimiter',)
        elif model_type is tp.SLDAModel:
            allowed = ('y',)
        else:
            allowed = ()
        return {k:v for k, v in args.items() if k in allowed}
    
    def _feed_docs_to(self, model, transform=None):
        # Push the vocabulary and every stored document into `model`.
        # `transform` (optional) maps each document's keyword-argument dict before filtering.
        if not self._docs: 
            raise ValueError("Cannot feed zero-size corpus.")
        
        model._update_vocab(self._vocab.id2word)
        transform = transform or (lambda x:x)
        model_type = type(model)

        if self._tokenizer:
            # Stored layout: (token ids, raw text, start positions, lengths, kwargs)
            for doc in self._docs:
                model._add_doc(doc[0], raw=doc[1], start_pos=doc[2], length=doc[3], **self._select_args_for_model(model_type, transform(doc[4])))
        else:
            # Stored layout: (token ids, kwargs)
            for doc in self._docs:
                model._add_doc(doc[0], **self._select_args_for_model(model_type, transform(doc[1])))

    def _tokenize(self, raw, user_data=None):
        # Run the tokenizer on `raw`, drop stopwords and convert the remaining words to ids.
        # Returns (token ids, start positions, lengths); the last two are only
        # filled for tokens the tokenizer yields as (word, start, length) tuples.
        tokens, ss, ls = [], [], []
        for t in self._tokenizer(raw, user_data=user_data):
            if isinstance(t, str):
                if self._stopwords(t): continue
                tokens.append(self._vocab.to_id(t))
            elif isinstance(t, tuple) and len(t) == 3:
                if self._stopwords(t[0]): continue
                tokens.append(self._vocab.to_id(t[0]))
                ss.append(t[1])
                ls.append(t[2])
            else:
                raise ValueError("`tokenizer` must return `str` or `tuple` of (`str`, `int`, `int`).")
        return tokens, ss, ls

    def add_doc(self, words=None, raw=None, user_data=None, **kargs):
        '''Add a new document into the corpus and return an index of the inserted document. 
If the given document is empty, nothing is inserted and -1 is returned.
This method requires either `words` parameter or `raw` and `user_data` parameters. 
If `words` parameter is provided, `words` are expected to be already preprocessed results.
If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.DMRModel` or `y` for `tomotopy.SLDAModel`, you can pass it as an arbitrary keyword argument.

Parameters
----------
words : Iterable[str]
    a list of words that are already preprocessed
raw : str
    a raw string of document which isn't preprocessed yet. 
    The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
user_data : Any
    user data for `tokenizer`. The `raw` and `user_data` parameters are sent to `tokenizer`.
**kargs
    arbitrary keyword arguments for specific topic models
        '''
        if self._tokenizer:
            if words is not None: 
                raise ValueError("`raw` is required when `tokenizer` or `batch_tokenizer` is provided.")
            if not isinstance(raw, str):
                raise ValueError("`raw` must be `str` type.")
            if not raw: return -1
            tokens, ss, ls = self._tokenize(raw, user_data=user_data)
            self._docs.append((tokens, raw, ss, ls, kargs))
        else:
            if raw is not None: 
                raise ValueError("`words` is required when neither `tokenizer` nor `batch_tokenizer` is provided.")
            if isinstance(words, str):
                raise ValueError("`words` must not be `str`, but `iterable` of `str` type.")
            if not words: return -1
            self._docs.append(([self._vocab.to_id(w) for w in words], kargs))
        return len(self._docs) - 1
    
    def process(self, data_feeder):
        '''Add multiple documents into the corpus through a given iterable `data_feeder` and return the number of items read from it.
Items whose `raw` string is empty are counted in the return value although `add_doc` does not insert them.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). 
        '''
        res = []
        num = 0
        for d in data_feeder:
            num += 1
            if isinstance(d, str):
                res.append((d, None, {}))
            elif isinstance(d, tuple) and len(d) == 2:
                res.append((*d, {}))
            elif isinstance(d, tuple) and len(d) == 3:
                res.append(d)
            else:
                raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]")

            # Documents are validated first and only added once a full batch is collected.
            if len(res) >= self._batch_size:
                for raw, user_data, kargs in res:
                    self.add_doc(raw=raw, user_data=user_data, **kargs)
                res.clear()
        
        # Flush the last, possibly partial, batch.
        for raw, user_data, kargs in res:
            self.add_doc(raw=raw, user_data=user_data, **kargs)
        
        return num

    def save(self, filename:str):
        '''Save the current instance into the file `filename`. 
The instance is serialized with `pickle`, so a custom `tokenizer` or callable `stopwords` must be picklable.

Parameters
----------
filename : str
    a path for the file where the instance is saved
        '''
        import pickle
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename:str):
        '''Load and return an instance from the file `filename`.
The file is read with `pickle`, which can execute arbitrary code while loading: only load files you trust.

Parameters
----------
filename : str
    a path for the file to be loaded
        '''
        import pickle
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def __len__(self):
        # Number of documents currently stored in the corpus.
        return len(self._docs)

Static methods

def load(filename)

Load and return an instance from the file filename

Parameters

filename : str: a path for the file to be loaded

Expand source code

    @staticmethod
    def load(filename:str):
        '''Load and return an instance from the file `filename`

Parameters
----------
filename : str
    a path for the file to be loaded
        '''
        import pickle
        with open(filename, 'rb') as f:
            return pickle.load(f)

Methods

def add_doc(self, words=None, raw=None, user_data=None, **kargs)

Add a new document into the corpus and return an index of the inserted document. This method requires either words parameter or raw and user_data parameters. If words parameter is provided, words are expected to be already preprocessed results. If raw parameter is provided, raw is expected to be a raw string of document which isn't preprocessed yet, and tokenizer will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as metadata for DMRModel or y for SLDAModel, you can pass it as an arbitrary keyword argument.

Parameters

words : Iterable[str]: a list of words that are already preprocessed
raw : str: a raw string of document which isn't preprocessed yet. The raw parameter can be used only when the tokenizer parameter of __init__ is set.
user_data : Any: user data for the tokenizer. Both the raw and user_data parameters are passed to the tokenizer.
**kargs: arbitrary keyword arguments for specific topic models

Expand source code

    def add_doc(self, words=None, raw=None, user_data=None, **kargs):
        '''Add a new document into the corpus and return an index of the inserted document. 
This method requires either `words` parameter or `raw` and `user_data` parameters. 
If `words` parameter is provided, `words` are expected to be already preprocessed results.
If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.DMRModel` or `y` for `tomotopy.SLDAModel`, you can pass it as an arbitrary keyword argument.

Parameters
----------
words : Iterable[str]
    a list of words that are already preprocessed
raw : str
    a raw string of document which isn't preprocessed yet. 
    The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
user_data : Any
    an user data for `tokenizer`. The `raw` and `user_data` parameter are sent to `tokenizer`.
**kargs
    arbitrary keyword arguments for specific topic models
        '''
        if self._tokenizer:
            if not words is None: 
                raise ValueError("`raw` is required when `tokenizer` or `batch_tokenizer` is provided.")
            if not type(raw) is str:
                raise ValueError("`raw` must be `str` type.")
            if not raw: return -1
            tokens, ss, ls = self._tokenize(raw, user_data=user_data)
            self._docs.append((tokens, raw, ss, ls, kargs))
        else:
            if not raw is None: 
                raise ValueError("`words` is required when neither `tokenizer` nor `batch_tokenizer` is provided.")
            if type(words) is str:
                raise ValueError("`words` must not be `str`, but `iterable` of `str` type.")
            if not words: return -1
            self._docs.append(([self._vocab.to_id(w) for w in words], kargs))
        return len(self._docs) - 1

def process(self, data_feeder)

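Add multiple documents into the corpus through a given iterable data_feeder and return the number of items read from it. Items whose raw string is empty are counted in the return value even though add_doc() does not insert them.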

Parameters

data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]: any iterable yielding a str raw, a tuple of (raw, user_data) or a tuple of (raw, user_data, arbitrary_keyword_args).

Expand source code

    def process(self, data_feeder):
        '''Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). 
        '''
        res = []
        num = 0
        for d in data_feeder:
            num += 1
            if type(d) is tuple and len(d) == 2:
                res.append((*d, {}))
            elif type(d) is tuple and len(d) == 3:
                res.append(d)
            elif type(d) is str:
                res.append((d, None, {}))
            else:
                raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]")

            if len(res) >= self._batch_size:
                for raw, user_data, kargs in res:
                    self.add_doc(raw=raw, user_data=user_data, **kargs)
                res.clear()
        
        for raw, user_data, kargs in res:
            self.add_doc(raw=raw, user_data=user_data, **kargs)
        
        return num

def save(self, filename)

Save the current instance into the file filename.

Parameters

filename : str: a path for the file where the instance is saved

Expand source code

    def save(self, filename:str):
        '''Save the current instance into the file `filename`. 

Parameters
----------
filename : str
    a path for the file where the instance is saved
        '''
        import pickle
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

class SimpleTokenizer (stemmer=None)

SimpleTokenizer provides a simple word-tokenizing utility with an arbitrary stemmer.

Parameters

stemmer : Callable[str, str]: a callable object for stemming words. If this value is set to None, words are not stemmed.

Here is an example of using SimpleTokenizer with NLTK for stemming.

# Example: build a stemmed corpus, train an LDA model on it and label the topics automatically.
import tomotopy as tp

# This code requires nltk package for stemming.
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

stemmer = PorterStemmer()
stopwords = set(stopwords.words('english'))
# Tokens are lower-cased and stemmed by SimpleTokenizer; the callable `stopwords`
# then drops tokens of at most two characters and tokens found in the NLTK stopword set.
corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem), 
    stopwords=lambda x: len(x) <= 2 or x in stopwords)
# data_feeder yields a tuple of (raw string, user data) or a str (raw string)
# `input_file` is not defined in this snippet: set it to the path of a UTF-8 text file.
# Iterating over the opened file yields one line at a time, so every line becomes one raw document.
corpus.process(open(input_file, encoding='utf-8'))

# make LDA model and train
mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
mdl.train(0)
print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
print('Removed top words:', mdl.removed_top_words)
# Train in steps of 10 iterations and report the per-word log-likelihood after each step.
for i in range(0, 1000, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

# extract candidates for auto topic labeling
extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
cands = extractor.extract(mdl)

# Print, for every topic, its five best labels followed by its ten top words with their probabilities.
labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
for k in range(mdl.k):
    print("== Topic #{} ==".format(k))
    print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
    for word, prob in mdl.get_topic_words(k, top_n=10):
        print(word, prob, sep='\t')
    print()

# Example of Results
# -----------------
# == Topic #13 ==
# Labels: weapon systems, weaponry, anti-aircraft, towed, long-range
# aircraft        0.020458335056900978
# use     0.019993379712104797
# airlin  0.012523100711405277
# car     0.012058146297931671
# vehicl  0.01165518444031477
# carrier 0.011531196534633636
# tank    0.011221226304769516
# design  0.010694277472794056
# audi    0.010322313755750656
# martin  0.009981346316635609
# 
# == Topic #17 ==
# Labels: American baseball player, American baseball, American actress, singer-songwriter and guitarist, American actor, director, producer, and screenwriter
# american        0.04471408948302269
# english 0.01746685802936554
# player  0.01714528724551201
# politician      0.014698212035000324
# footbal 0.012313882820308208
# author  0.010909952223300934
# actor   0.008949155919253826
# french  0.007647186517715454
# academ  0.0073020863346755505
# produc  0.006815808825194836
#

Expand source code

class SimpleTokenizer:
    '''`SimpleTokenizer` provides a simple word-tokenizing utility with an arbitrary stemmer.'''
    def __init__(self, stemmer=None):
        '''Parameters
----------
stemmer : Callable[[str], str]
    a callable object for stemming words. If this value is set to `None`, words are not stemmed.

Here is an example of using SimpleTokenizer with NLTK for stemming.

.. include:: ./auto_labeling_code_with_porter.rst
'''
        import re
        # A token is a maximal run of characters that are neither whitespace nor one of the listed punctuation marks.
        self._pat = re.compile(r"""[^\s.,;:'"?!<>(){}\[\]\\/`~@#$%^&*|]+""")
        # A named function (not a lambda) keeps the tokenizer, and any corpus holding it, picklable.
        self._stemmer = stemmer or SimpleTokenizer._identity

    @staticmethod
    def _identity(word):
        # Default "stemmer": leave the word untouched.
        return word

    def __call__(self, raw:str, user_data=None):
        '''Yield a tuple (stemmed lower-cased word, start position, length) for each token of `raw`.
The start position and the length always refer to the original `raw` string. `user_data` is accepted for interface compatibility and is not used.
'''
        # Match on `raw` itself and lower-case each token afterwards: lower-casing
        # the whole string first can change its length (e.g. 'İ' becomes two
        # code points), which would shift every following offset.
        for g in self._pat.finditer(raw):
            start, end = g.span(0)
            yield self._stemmer(g.group(0).lower()), start, end - start