Module tomotopy.utils

tomotopy.utils 서브모듈은 토픽 모델링에 유용한 여러 유틸리티를 제공합니다. Corpus 클래스는 대량의 문헌을 관리할 수 있게 돕습니다. Corpus에 입력된 문헌들은 다양한 토픽 모델에 바로 입력될 수 있습니다. 또한 코퍼스 전처리 결과를 파일에 저장함으로써 필요에 따라 다시 코퍼스를 파일에서 읽어들여 원하는 토픽 모델에 입력할 수 있습니다.

Classes

class Corpus (tokenizer=None, batch_size=64, stopwords=None)
Expand source code
class Corpus(_UtilsCorpus):
    '''`Corpus` class is a utility that makes it easy to manage large amounts of documents.
    An instance of `Corpus` can contain multiple preprocessed documents, and can be used directly by passing them as parameters of the topic modeling classes.
    '''
    class _VocabDict(_UtilsVocabDict):
        pass

    def __init__(self, tokenizer=None, batch_size=64, stopwords=None):
        '''Parameters
----------
tokenizer : Union[Callable[[str, Any], List[Union[str, Tuple[str, int, int]]]], Callable[[Iterable[Tuple[str, Any]]], Iterable[List[Union[str, Tuple[str, int, int]]]]]]
    a callable object for tokenizing raw documents. If `tokenizer` is provided, you can use `tomotopy.utils.Corpus.add_doc` method with `raw` and `user_data` parameters.
    `tokenizer` receives two arguments `raw` and `user_data` and 
    it should return an iterable of `str`(the tokenized word) or of Tuple[`str`, `int`, `int`] (the tokenized word, starting position of the word, the length of the word).
batch_size : int
    `tomotopy.utils.Corpus.process` method reads a bunch of documents and send them to `tomotopy.utils.Corpus.add_doc`. `batch_size` indicates the size of each batch.
stopwords : Union[Iterable[str], Callable[str, bool]]
    When calling `tomotopy.utils.Corpus.add_doc`, words in `stopwords` are not added to the document but are excluded.
    If `stopwords` is callable, a word is excluded from the document when `stopwords(word) == True`.
        '''
        super().__init__(self._VocabDict, None)
        self._tokenizer = tokenizer
        self._batch_size = batch_size
        if callable(stopwords):
            self._stopwords = stopwords
        elif stopwords is None:
            self._stopwords = None
        else:
            # Materialize the stopword set once. Building `set(stopwords)`
            # inside the lambda would rebuild it on every call and would break
            # entirely if `stopwords` is a one-shot iterator (it would be
            # exhausted after the first lookup).
            stopword_set = set(stopwords)
            self._stopwords = lambda x: x in stopword_set

    def _select_args_for_model(self, model_type:type, args:dict):
        '''Return the subset of `args` that is accepted by the given topic model type.'''
        import tomotopy as tp
        # NOTE: the accepted-key containers must be 1-tuples `('x',)`.
        # A bare `('x')` is just the string 'x', and `k in 'metadata'` performs
        # a *substring* test (e.g. 'data' in 'metadata' is True), silently
        # letting unrelated keyword arguments through.
        if model_type in (tp.DMRModel, tp.GDMRModel):
            return {k:v for k, v in args.items() if k in ('metadata',)}
        if model_type in (tp.LLDAModel, tp.PLDAModel):
            return {k:v for k, v in args.items() if k in ('labels',)}
        if model_type is tp.MGLDAModel:
            return {k:v for k, v in args.items() if k in ('delimiter',)}
        if model_type is tp.SLDAModel:
            return {k:v for k, v in args.items() if k in ('y',)}
        if model_type is tp.DTModel:
            return {k:v for k, v in args.items() if k in ('timepoint',)}
        return {}

    def add_doc(self, words=None, raw=None, user_data=None, **kargs) -> int:
        '''Add a new document into the corpus and return an index of the inserted document. 
This method requires either `words` parameter or `raw` and `user_data` parameters. 
If `words` parameter is provided, `words` are expected to be already preprocessed results.
If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.models.DMRModel` or `y` for `tomotopy.models.SLDAModel`, you can pass it as an arbitrary keyword argument.

Parameters
----------
words : Iterable[str]
    a list of words that are already preprocessed
raw : str
    a raw string of document which isn't preprocessed yet. 
    The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
user_data : Any
    user data for `tokenizer`. The `raw` and `user_data` parameter are sent to `tokenizer`.
**kargs
    arbitrary keyword arguments for specific topic models
        '''
        return super().add_doc(words, raw, user_data, kargs)

    def process(self, data_feeder, show_progress=False, total=None) -> int:
        '''Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). 
show_progress : bool
    if True, a `tqdm` progress bar is displayed while processing.
total : int
    the expected total number of documents; forwarded to `tqdm` so the
    progress bar can show a percentage. Only used when `show_progress` is True.
        '''
        if self._tokenizer is None:
            raise ValueError("`tokenizer` must be set when using `tomotopy.utils.Corpus.process`")
        
        # `num` is a 1-element list so the nested generator can mutate the
        # counter without a `nonlocal` declaration.
        num = [0]
        raw_list = []
        metadata_list = []
        if show_progress:
            from tqdm import tqdm
            data_feeder_iter = iter(tqdm(data_feeder, total=total))
        else:
            data_feeder_iter = iter(data_feeder)
        def _generate():
            # Yield at most `_batch_size` (raw, user_data) pairs per call,
            # recording raw text and per-document keyword args on the side.
            for _, d in zip(range(self._batch_size), data_feeder_iter):
                num[0] += 1
                if isinstance(d, tuple) and len(d) == 2:
                    raw_list.append(d[0])
                    metadata_list.append({})
                    yield d
                elif isinstance(d, tuple) and len(d) == 3:
                    raw_list.append(d[0])
                    metadata_list.append(d[2])
                    yield d[:2]
                elif isinstance(d, str):
                    raw_list.append(d)
                    metadata_list.append({})
                    yield (d, None)
                else:
                    raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]")    
        
        # Process batch by batch until the feeder is exhausted (add_docs
        # returns 0 when `_generate` yields nothing).
        while True:
            added = super().add_docs(self._tokenizer(_generate()), iter(raw_list), iter(metadata_list))
            if added == 0: break
            raw_list.clear()
            metadata_list.clear()
        return num[0]

    def save(self, filename:str, protocol=0) -> None:
        '''Save the current instance into the file `filename`. 

Parameters
----------
filename : str
    a path for the file where the instance is saved
protocol : int
    a pickle protocol number passed to `pickle.dump`.
    Default value is 0 for maximum compatibility.
        '''
        import pickle
        with open(filename, 'wb') as f:
            # `protocol` was previously accepted but silently ignored;
            # forward it so the documented parameter actually takes effect.
            pickle.dump(self, f, protocol)

    @staticmethod
    def load(filename:str) -> 'Corpus':
        '''Load and return an instance from the file `filename`

Parameters
----------
filename : str
    a path for the file to be loaded
        '''
        import pickle
        with open(filename, 'rb') as f:
            obj = pickle.load(f)
        # Reset the stopword filter after loading — presumably because a
        # callable filter may not round-trip through pickle. TODO confirm.
        obj._stopwords = None
        return obj

    def __len__(self) -> int:
        '''Return the number of documents in the corpus.'''
        return super().__len__()
    
    def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=float('-inf'), normalized=False, workers=0) -> List:
        '''..versionadded:: 0.10.0

Extract frequent n-grams using PMI score

Parameters
----------
min_cf : int
    Minimum collection frequency of n-grams to be extracted
min_df : int
    Minimum document frequency of n-grams to be extracted
max_len : int
    Maximum length of n-grams to be extracted
max_cand : int
    Maximum number of n-grams to be extracted
min_score : float
    Minimum PMI score of n-grams to be extracted
normalized : bool
    whether to use Normalized PMI or just PMI
workers : int
    an integer indicating the number of workers to perform samplings. 
    If `workers` is 0, the number of cores in the system will be used.

Returns
-------
candidates : List[tomotopy.label.Candidate]
    The extracted n-gram candidates in `tomotopy.label.Candidate` type
        '''
        return super().extract_ngrams(min_cf, min_df, max_len, max_cand, min_score, normalized, workers)
    
    def concat_ngrams(self, cands, delimiter='_') -> None:
        '''..versionadded:: 0.10.0

Concatenate n-grams matching the given candidates in the corpus into single words

Parameters
----------
cands : Iterable[tomotopy.label.Candidate]
    n-gram candidates to be concatenated. It can be generated by `tomotopy.utils.Corpus.extract_ngrams`.
delimiter : str
    Delimiter to be used for concatenating words. Default value is `'_'`.
        '''
        return super().concat_ngrams(cands, delimiter)

Corpus는 대량의 문헌을 간편하게 다룰 수 있게 도와주는 유틸리티 클래스입니다. Corpus 클래스의 인스턴스는 여러 개의 문헌을 포함할 수 있으며, 토픽 모델 클래스에 파라미터로 직접 넘겨질 수 있습니다.

파라미터

tokenizer : Callable[[str, Any], Iterable[Union[str, Tuple[str, int, int]]]]
비정제 문헌을 처리하는 데에 사용되는 호출 가능한 객체. tokenizer가 None이 아닌 값으로 주어진 경우, Corpus.add_doc() 메소드를 호출할 때 raw와 user_data 파라미터를 사용할 수 있습니다. tokenizer는 인수로 raw와 user_data 2개를 받으며, 처리 결과로 str(정제된 단어) 혹은 Tuple[str, int, int] (정제된 단어, 단어 시작 위치, 단어 길이)의 iterable을 반환해야 합니다.
batch_size : int
Corpus.process() 메소드는 대량의 문헌을 읽어들인 후 Corpus.add_doc()으로 넘깁니다. 이 때 한번에 읽어들이는 문헌의 개수를 batch_size로 지정할 수 있습니다.
stopwords : Iterable[str]
Corpus.add_doc()가 호출될 때, stopwords에 포함된 단어들은 처리 단계에서 등록되지 않고 제외됩니다. stopwords가 호출가능한 경우, stopwords(word) == True이면 word는 불용어 처리되어 제외됩니다.

파라미터

tokenizer : Union[Callable[[str, Any], List[Union[str, Tuple[str, int, int]]]], Callable[[Iterable[Tuple[str, Any]]], Iterable[List[Union[str, Tuple[str, int, int]]]]]]
a callable object for tokenizing raw documents. If tokenizer is provided, you can use Corpus.add_doc() method with raw and user_data parameters. tokenizer receives two arguments raw and user_data and it should return an iterable of str(the tokenized word) or of Tuple[str, int, int] (the tokenized word, starting position of the word, the length of the word).
batch_size : int
Corpus.process() method reads a bunch of documents and send them to Corpus.add_doc(). batch_size indicates the size of each batch.
stopwords : Union[Iterable[str], Callable[str, bool]]
When calling Corpus.add_doc(), words in stopwords are not added to the document but are excluded. If stopwords is callable, a word is excluded from the document when stopwords(word) == True.

부모 클래스

  • tomotopy._UtilsCorpus

Static methods

def load(filename: str) ‑> Corpus
Expand source code
    @staticmethod
    def load(filename:str) -> 'Corpus':
        '''Load and return an instance from the file `filename`

Parameters
----------
filename : str
    a path for the file to be loaded
        '''
        import pickle
        with open(filename, 'rb') as f:
            obj = pickle.load(f)
        obj._stopwords = None
        return obj

파일 filename로부터 인스턴스를 읽어들여 반환합니다.

파라미터

filename : str
읽어들일 파일의 경로

메소드

def add_doc(self, words=None, raw=None, user_data=None, **kargs) ‑> int
Expand source code
    def add_doc(self, words=None, raw=None, user_data=None, **kargs) -> int:
        '''Add a new document into the corpus and return an index of the inserted document. 
This method requires either `words` parameter or `raw` and `user_data` parameters. 
If `words` parameter is provided, `words` are expected to be already preprocessed results.
If `raw` parameter is provided, `raw` is expected to be a raw string of document which isn't preprocessed yet, and `tokenizer` will be called for preprocessing the raw document.

If you need additional parameters for a specific topic model, such as `metadata` for `tomotopy.models.DMRModel` or `y` for `tomotopy.models.SLDAModel`, you can pass it as an arbitrary keyword argument.

Parameters
----------
words : Iterable[str]
    a list of words that are already preprocessed
raw : str
    a raw string of document which isn't preprocessed yet. 
    The `raw` parameter can be used only when the `tokenizer` parameter of `__init__` is set.
user_data : Any
    user data for `tokenizer`. The `raw` and `user_data` parameter are sent to `tokenizer`.
**kargs
    arbitrary keyword arguments for specific topic models
        '''
        return super().add_doc(words, raw, user_data, kargs)

새 문헌을 코퍼스에 추가하고 추가된 문헌의 인덱스 번호를 반환합니다. 이 메소드는 words 파라미터나 raw, user_data 파라미터 둘 중 하나를 요구합니다. words 파라미터를 사용할 경우, words는 이미 전처리된 단어들의 리스트여야 합니다. raw 파라미터를 사용할 경우, raw는 정제되기 전 문헌의 str이며, tokenizer가 이 비정제문헌을 처리하기 위해 호출됩니다.

만약 DMRModel의 metadata나 SLDAModel의 y처럼 특정한 토픽 모델에 필요한 추가 파라미터가 있다면 임의 키워드 인자로 넘겨줄 수 있습니다.

파라미터

words : Iterable[str]
이미 전처리된 단어들의 리스트
raw : str
전처리되기 이전의 문헌. 이 파라미터를 사용하려면 인스턴스 생성시 tokenizer 파라미터를 넣어줘야 합니다.
user_data : Any
tokenizer에 넘어가는 유저 데이터. raw와 user_data 파라미터가 함께 tokenizer로 넘어갑니다.
**kargs
추가적인 파라미터를 위한 임의 키워드 인자
def concat_ngrams(self, cands, delimiter='_') ‑> None
Expand source code
    def concat_ngrams(self, cands, delimiter='_') -> None:
        '''..versionadded:: 0.10.0

Concatenate n-grams matching the given candidates in the corpus into single words

Parameters
----------
cands : Iterable[tomotopy.label.Candidate]
    n-gram candidates to be concatenated. It can be generated by `tomotopy.utils.Corpus.extract_ngrams`.
delimiter : str
    Delimiter to be used for concatenating words. Default value is `'_'`.
        '''
        return super().concat_ngrams(cands, delimiter)

추가된 버전: 0.10.0

코퍼스 내에서 주어진 n-gram 목록과 일치하는 단어열을 하나의 단어로 합칩니다.

파라미터

cands : Iterable[Candidate]
합칠 n-gram의 List. Corpus.extract_ngrams()로 생성할 수 있습니다.
delimiter : str
여러 단어들을 연결할 때 사용할 구분자. 기본값은 '_'입니다.
def extract_ngrams(self,
min_cf=10,
min_df=5,
max_len=5,
max_cand=5000,
min_score=-inf,
normalized=False,
workers=0) ‑> List
Expand source code
    def extract_ngrams(self, min_cf=10, min_df=5, max_len=5, max_cand=5000, min_score=float('-inf'), normalized=False, workers=0) -> List:
        '''..versionadded:: 0.10.0

Extract frequent n-grams using PMI score

Parameters
----------
min_cf : int
    Minimum collection frequency of n-grams to be extracted
min_df : int
    Minimum document frequency of n-grams to be extracted
max_len : int
    Maximum length of n-grams to be extracted
max_cand : int
    Maximum number of n-grams to be extracted
min_score : float
    Minimum PMI score of n-grams to be extracted
normalized : bool
    whether to use Normalized PMI or just PMI
workers : int
    an integer indicating the number of workers to perform samplings. 
    If `workers` is 0, the number of cores in the system will be used.

Returns
-------
candidates : List[tomotopy.label.Candidate]
    The extracted n-gram candidates in `tomotopy.label.Candidate` type
        '''
        return super().extract_ngrams(min_cf, min_df, max_len, max_cand, min_score, normalized, workers)

추가된 버전: 0.10.0

PMI 점수를 이용해 자주 등장하는 n-gram들을 추출합니다.

파라미터

min_cf : int
추출할 n-gram의 최소 장서빈도
min_df : int
추출할 n-gram의 최소 문헌빈도
max_len : int
추출할 n-gram의 최대 길이
max_cand : int
추출할 n-gram의 최대 개수
min_score : float
추출할 n-gram의 최소 PMI 점수

Returns

candidates : List[Candidate]
추출된 n-gram 후보의 리스트. Candidate 타입
def process(self, data_feeder, show_progress=False, total=None) ‑> int
Expand source code
    def process(self, data_feeder, show_progress=False, total=None) -> int:
        '''Add multiple documents into the corpus through a given iterator `data_feeder` and return the number of documents inserted.

Parameters
----------
data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
    any iterable yielding a str `raw`, a tuple of (`raw`, `user_data`) or a tuple of (`raw`, `user_data`, `arbitrary_keyword_args`). 
        '''
        if self._tokenizer is None:
            raise ValueError("`tokenizer` must be set when using `tomotopy.utils.Corpus.process`")
        
        num = [0]
        raw_list = []
        metadata_list = []
        if show_progress:
            from tqdm import tqdm
            data_feeder_iter = iter(tqdm(data_feeder, total=total))
        else:
            data_feeder_iter = iter(data_feeder)
        def _generate():
            for _, d in zip(range(self._batch_size), data_feeder_iter):
                num[0] += 1
                if isinstance(d, tuple) and len(d) == 2:
                    raw_list.append(d[0])
                    metadata_list.append({})
                    yield d
                elif isinstance(d, tuple) and len(d) == 3:
                    raw_list.append(d[0])
                    metadata_list.append(d[2])
                    yield d[:2]
                elif isinstance(d, str):
                    raw_list.append(d)
                    metadata_list.append({})
                    yield (d, None)
                else:
                    raise ValueError("`data_feeder` must return an iterable of str, of Tuple[str, Any] or Tuple[str, Any, dict]")    
        
        while 1:
            added = super().add_docs(self._tokenizer(_generate()), iter(raw_list), iter(metadata_list))
            if added == 0: break
            raw_list.clear()
            metadata_list.clear()
        return num[0]

이터레이터 data_feeder를 통해 다수의 문헌을 코퍼스에 추가하고, 추가된 문헌의 개수를 반환합니다.

파라미터

data_feeder : Iterable[Union[str, Tuple[str, Any], Tuple[str, Any, dict]]]
문자열 raw이나, 튜플 (raw, user_data), 혹은 튜플 (raw, user_data, kargs) 를 반환하는 이터레이터.
def save(self, filename: str, protocol=0) ‑> None
Expand source code
    def save(self, filename:str, protocol=0) -> None:
        '''Save the current instance into the file `filename`. 

Parameters
----------
filename : str
    a path for the file where the instance is saved
        '''
        import pickle
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

현재 인스턴스를 파일 filename에 저장합니다.

파라미터

filename : str
인스턴스가 저장될 파일의 경로
class Document (*args, **kwargs)
Expand source code
class Document(_Document):
    '''This type provides an abstract interface for accessing documents used in topic models.

An instance of this type can be acquired from `tomotopy.models.LDAModel.make_doc` method or `tomotopy.models.LDAModel.docs` member of each Topic Model instance.'''
    # All methods and properties below are thin wrappers delegating to the
    # C-extension base class `_Document`; they exist to attach Python-level
    # signatures and documentation to the native implementation.
    
    def get_topics(self, top_n=10, from_pseudo_doc=False) -> List[Tuple[int, float]]:
        '''Return the `top_n` topics with their probabilities for the document.

Parameters
----------
top_n : int
    the `n` in "top-n"
from_pseudo_doc : bool
    .. versionadded:: 0.12.2

    If True, it returns the topic distribution of its pseudo document. Only valid for `tomotopy.models.PTModel`.'''
        return super().get_topics(top_n, from_pseudo_doc)
    
    def get_topic_dist(self, normalize=True, from_pseudo_doc=False) -> List[float]:
        '''Return a distribution of the topics in the document.

Parameters
----------
normalize : bool
    .. versionadded:: 0.11.0

    If True, it returns the probability distribution with the sum being 1. Otherwise it returns the distribution of raw values.
from_pseudo_doc : bool
    .. versionadded:: 0.12.2

    If True, it returns the topic distribution of its pseudo document. Only valid for `tomotopy.models.PTModel`.'''
        return super().get_topic_dist(normalize, from_pseudo_doc)
    
    def get_sub_topics(self, top_n=10) -> List[Tuple[int, float]]:
        '''.. versionadded:: 0.5.0

Return the `top_n` sub topics with their probabilities for the document. (for only `tomotopy.models.PAModel`)'''
        return super().get_sub_topics(top_n)
    
    def get_sub_topic_dist(self, normalize=True) -> List[float]:
        '''.. versionadded:: 0.5.0

Return a distribution of the sub topics in the document. (for only `tomotopy.models.PAModel`)

Parameters
----------
normalize : bool
    .. versionadded:: 0.11.0

    If True, it returns the probability distribution with the sum being 1. Otherwise it returns the distribution of raw values.'''
        return super().get_sub_topic_dist(normalize)
    
    def get_words(self, top_n=10) -> List[Tuple[int, float]]:
        '''.. versionadded:: 0.4.2

Return the `top_n` words with their probabilities for the document.'''
        return super().get_words(top_n)
    
    def get_count_vector(self) -> List[int]:
        '''.. versionadded:: 0.7.0

Return a count vector for the current document.'''
        return super().get_count_vector()
    
    def get_ll(self) -> float:
        '''.. versionadded:: 0.10.0

Return total log-likelihood for the current document.'''
        return super().get_ll()
    
    @property
    def words(self) -> List[int]:
        '''a `list` of IDs for each word (read-only)'''
        return super()._words
    
    @property
    def weights(self) -> List[float]:
        '''a `list` of weights for each word (read-only)'''
        return super()._weights
    
    @property
    def topics(self) -> List[int]:
        '''a `list` of topics for each word (read-only)

This represents super topics in `tomotopy.models.PAModel` and `tomotopy.models.HPAModel` model.'''
        return super()._topics
    
    @property
    def uid(self) -> str:
        '''a unique ID for the document (read-only)'''
        return super()._uid
    
    @property
    def metadata(self) -> str:
        '''categorical metadata of the document (for only `tomotopy.models.DMRModel` and `tomotopy.models.GDMRModel` model, read-only)'''
        return super()._metadata
    
    @property
    def multi_metadata(self) -> List[str]:
        '''categorical multiple metadata of the document (for only `tomotopy.models.DMRModel` and `tomotopy.models.GDMRModel` model, read-only)

.. versionadded:: 0.12.0'''
        return super()._multi_metadata
    
    @property
    def numeric_metadata(self) -> List[float]:
        '''continuous numeric metadata of the document (for only `tomotopy.models.GDMRModel` model, read-only)

.. versionadded:: 0.11.0'''
        return super()._numeric_metadata
    
    @property
    def subtopics(self) -> List[int]:
        '''a `list` of sub topics for each word (for only `tomotopy.models.PAModel` and `tomotopy.models.HPAModel` model, read-only)'''
        return super()._subtopics
    
    @property
    def windows(self) -> List[int]:
        '''a `list` of window IDs for each word (for only `tomotopy.models.MGLDAModel` model, read-only)'''
        return super()._windows
    
    @property
    def paths(self) -> List[int]:
        '''a `list` of topic ids by depth for a given document (for only `tomotopy.models.HLDAModel` model, read-only)

.. versionadded:: 0.7.1'''
        return super()._paths
    
    @property
    def beta(self) -> List[float]:
        '''a `list` of beta parameters for each topic (for only `tomotopy.models.CTModel` model, read-only)

.. versionadded:: 0.2.0'''
        return super()._beta
    
    @property
    def vars(self) -> List[float]:
        '''a `list` of response variables (for only `tomotopy.models.SLDAModel` model, read-only)

.. versionadded:: 0.2.0'''
        return super()._vars
    
    @property
    def labels(self) -> List[Tuple[str, List[float]]]:
        '''a `list` of (label, list of probabilities of each topic belonging to the label) of the document (for only `tomotopy.models.LLDAModel` and `tomotopy.models.PLDAModel` models, read-only)

.. versionadded:: 0.3.0'''
        return super()._labels
    
    @property
    def eta(self) -> List[float]:
        '''a `list` of eta parameters(topic distribution) for the current document (for only `tomotopy.models.DTModel` model, read-only)

.. versionadded:: 0.7.0'''
        return super()._eta
    
    @property
    def timepoint(self) -> int:
        '''a timepoint of the document (for only `tomotopy.models.DTModel` model, read-only)

.. versionadded:: 0.7.0'''
        return super()._timepoint
    
    @property
    def raw(self) -> Optional[str]:
        '''a raw text of the document (read-only)'''
        return super()._raw
        
    @property
    def span(self) -> List[Tuple[int, int]]:
        '''a span (tuple of a start position and an end position in bytes) for each word token in the document (read-only)'''
        return super()._span
    
    @property
    def pseudo_doc_id(self) -> int:
        '''an ID of a pseudo document where the document is allocated to (for only `tomotopy.models.PTModel` model, read-only)

.. versionadded:: 0.11.0'''
        return super()._pseudo_doc_id

이 타입은 토픽 모델에 사용되는 문헌들에 접근할 수 있는 추상 인터페이스을 제공합니다.

부모 클래스

  • tomotopy._Document

인스턴스 변수

prop beta : List[float]
Expand source code
    @property
    def beta(self) -> List[float]:
        '''a `list` of beta parameters for each topic (for only `tomotopy.models.CTModel` model, read-only)

.. versionadded:: 0.2.0'''
        return super()._beta

문헌의 각 토픽별 beta 파라미터를 보여주는 list (CTModel 모형에서만 사용됨, 읽기전용)

추가된 버전: 0.2.0

prop eta : List[float]
Expand source code
    @property
    def eta(self) -> List[float]:
        '''a `list` of eta parameters(topic distribution) for the current document (for only `tomotopy.models.DTModel` model, read-only)

.. versionadded:: 0.7.0'''
        return super()._eta

문헌의 eta 파라미터(토픽 분포)를 나타내는 list (DTModel 모형에서만 사용됨, 읽기전용)

추가된 버전: 0.7.0

prop labels : List[Tuple[str, List[float]]]
Expand source code
    @property
    def labels(self) -> List[Tuple[str, List[float]]]:
        '''a `list` of (label, list of probabilities of each topic belonging to the label) of the document (for only `tomotopy.models.LLDAModel` and `tomotopy.models.PLDAModel` models, read-only)

.. versionadded:: 0.3.0'''
        return super()._labels

문헌에 매겨진 (레이블, 레이블에 속하는 각 주제의 확률들)의 list (LLDAModel, PLDAModel 모형에서만 사용됨 , 읽기전용)

추가된 버전: 0.3.0

prop metadata : str
Expand source code
@property
def metadata(self) -> str:
    '''categorical metadata of the document (for only `tomotopy.models.DMRModel` and `tomotopy.models.GDMRModel` model, read-only)'''
    return super()._metadata

문헌의 범주형 메타데이터 (DMRModel과 GDMRModel 모형에서만 사용됨, 읽기전용)

prop multi_metadata : List[str]
Expand source code
    @property
    def multi_metadata(self) -> List[str]:
        '''categorical multiple metadata of the document (for only `tomotopy.models.DMRModel` and `tomotopy.models.GDMRModel` model, read-only)

.. versionadded:: 0.12.0'''
        return super()._multi_metadata

문헌의 복수 범주형 메타데이터 (DMRModel과 GDMRModel 모형에서만 사용됨, 읽기전용)

추가된 버전: 0.12.0

prop numeric_metadata : List[float]
Expand source code
    @property
    def numeric_metadata(self) -> List[float]:
        '''continuous numeric metadata of the document (for only `tomotopy.models.GDMRModel` model, read-only)

.. versionadded:: 0.11.0'''
        return super()._numeric_metadata

문헌의 연속형 숫자 메타데이터 (GDMRModel 모형에서만 사용됨, 읽기전용)

추가된 버전: 0.11.0

prop paths : List[int]
Expand source code
    @property
    def paths(self) -> List[int]:
        '''a `list` of topic ids by depth for a given document (for only `tomotopy.models.HLDAModel` model, read-only)

.. versionadded:: 0.7.1'''
        return super()._paths

주어진 문헌에 대한 깊이별 토픽 번호의 list (HLDAModel 모형에서만 사용됨, 읽기전용)

추가된 버전: 0.7.1

prop pseudo_doc_id : int
Expand source code
    @property
    def pseudo_doc_id(self) -> int:
        '''an ID of a pseudo document where the document is allocated to (for only `tomotopy.models.PTModel` model, read-only)

.. versionadded:: 0.11.0'''
        return super()._pseudo_doc_id

문헌이 할당된 가상 문헌의 id (PTModel 모형에서만 사용됨, 읽기전용)

추가된 버전: 0.11.0

prop raw : str | None
Expand source code
@property
def raw(self) -> Optional[str]:
    '''a raw text of the document (read-only)'''
    return super()._raw

문헌의 가공되지 않는 전체 텍스트 (읽기전용)

prop span : List[Tuple[int, int]]
Expand source code
@property
def span(self) -> List[Tuple[int, int]]:
    '''a span (tuple of a start position and an end position in bytes) for each word token in the document (read-only)'''
    return super()._span

문헌의 각 단어 토큰의 구간(바이트 단위 시작 지점과 끝 지점의 tuple) (읽기전용)

prop subtopics : List[int]
Expand source code
@property
def subtopics(self) -> List[int]:
    '''a `list` of sub topics for each word (for only `tomotopy.models.PAModel` and `tomotopy.models.HPAModel` model, read-only)'''
    return super()._subtopics

문헌의 단어들이 각각 할당된 하위 토픽을 보여주는 list (PAModel과 HPAModel 모형에서만 사용됨, 읽기전용)

prop timepoint : int
Expand source code
    @property
    def timepoint(self) -> int:
        '''a timepoint of the document (for only `tomotopy.models.DTModel` model, read-only)

.. versionadded:: 0.7.0'''
        return super()._timepoint

문헌의 시점 (DTModel 모형에서만 사용됨, 읽기전용)

추가된 버전: 0.7.0

prop topics : List[int]
Expand source code
    @property
    def topics(self) -> List[int]:
        '''a `list` of topics for each word (read-only)

This represents super topics in `tomotopy.models.PAModel` and `tomotopy.models.HPAModel` model.'''
        return super()._topics

문헌의 단어들이 각각 할당된 토픽을 보여주는 list (읽기 전용)

PAModel과 HPAModel 모형에서는 이 값이 상위토픽의 ID를 가리킵니다.

prop uid : str
Expand source code
@property
def uid(self) -> str:
    '''a unique ID for the document (read-only)'''
    return super()._uid

문헌의 고유 ID (읽기전용)

prop vars : List[float]
Expand source code
    @property
    def vars(self) -> List[float]:
        '''a `list` of response variables (for only `tomotopy.models.SLDAModel` model, read-only)

.. versionadded:: 0.2.0'''
        return super()._vars

문헌의 응답 변수를 보여주는 list (SLDAModel 모형에서만 사용됨 , 읽기전용)

추가된 버전: 0.2.0

prop weights : List[float]
Expand source code
@property
def weights(self) -> List[float]:
    '''a `list` of weights for each word (read-only)'''
    return super()._weights

문헌의 가중치 (읽기전용)

prop windows : List[int]
Expand source code
@property
def windows(self) -> List[int]:
    '''a `list` of window IDs for each word (for only `tomotopy.models.MGLDAModel` model, read-only)'''
    return super()._windows

문헌의 단어들이 할당된 윈도우의 ID를 보여주는 list (MGLDAModel 모형에서만 사용됨, 읽기전용)

prop words : List[int]
Expand source code
@property
def words(self) -> List[int]:
    '''a `list` of IDs for each word (read-only)'''
    return super()._words

문헌 내 단어들의 ID가 담긴 list (읽기전용)

메소드

def get_count_vector(self) ‑> List[int]
Expand source code
    def get_count_vector(self) -> List[int]:
        '''.. versionadded:: 0.7.0

Return a count vector for the current document.'''
        return super().get_count_vector()

추가된 버전: 0.7.0

현재 문헌의 카운트 벡터를 반환합니다.

def get_ll(self) ‑> float
Expand source code
    def get_ll(self) -> float:
        '''.. versionadded:: 0.10.0

Return total log-likelihood for the current document.'''
        return super().get_ll()

추가된 버전: 0.10.0

현재 문헌의 로그가능도 총합을 반환합니다.

def get_sub_topic_dist(self, normalize=True) ‑> List[float]
Expand source code
    def get_sub_topic_dist(self, normalize=True) -> List[float]:
        '''.. versionadded:: 0.5.0

Return a distribution of the sub topics in the document. (for only `tomotopy.models.PAModel`)

Parameters
----------
normalize : bool
    .. versionadded:: 0.11.0

    If True, it returns the probability distribution with the sum being 1. Otherwise it returns the distribution of raw values.'''
        return super().get_sub_topic_dist(normalize)

추가된 버전: 0.5.0

현재 문헌의 하위 토픽 확률 분포를 list 형태로 반환합니다. (PAModel 전용)

파라미터

normalize : bool

추가된 버전: 0.11.0

참일 경우 총합이 1이 되는 확률 분포를 반환하고, 거짓일 경우 정규화되지 않는 값을 그대로 반환합니다.

def get_sub_topics(self, top_n=10) ‑> List[Tuple[int, float]]
Expand source code
    def get_sub_topics(self, top_n=10) -> List[Tuple[int, float]]:
        '''.. versionadded:: 0.5.0

Return the `top_n` sub topics with their probabilities for the document. (for only `tomotopy.models.PAModel`)'''
        return super().get_sub_topics(top_n)

추가된 버전: 0.5.0

현재 문헌의 상위 top_n개의 하위 토픽과 그 확률을 tuple의 list 형태로 반환합니다. (PAModel 전용)

def get_topic_dist(self, normalize=True, from_pseudo_doc=False) ‑> List[float]
Expand source code
    def get_topic_dist(self, normalize=True, from_pseudo_doc=False) -> List[float]:
        '''Return a distribution of the topics in the document.

Parameters
----------
normalize : bool
    .. versionadded:: 0.11.0

    If True, it returns the probability distribution with the sum being 1. Otherwise it returns the distribution of raw values.
from_pseudo_doc : bool
    .. versionadded:: 0.12.2

    If True, it returns the topic distribution of its pseudo document. Only valid for `tomotopy.models.PTModel`.'''
        return super().get_topic_dist(normalize, from_pseudo_doc)

현재 문헌의 토픽 확률 분포를 list 형태로 반환합니다.

파라미터

normalize : bool

추가된 버전: 0.11.0

참일 경우 총합이 1이 되는 확률 분포를 반환하고, 거짓일 경우 정규화되지 않는 값을 그대로 반환합니다.

from_pseudo_doc : bool

추가된 버전: 0.12.2

참일 경우 가상 문헌의 토픽 분포를 반환합니다. PTModel에서만 유효합니다.

def get_topics(self, top_n=10, from_pseudo_doc=False) ‑> List[Tuple[int, float]]
Expand source code
    def get_topics(self, top_n=10, from_pseudo_doc=False) -> List[Tuple[int, float]]:
        '''Return the `top_n` topics with their probabilities for the document.

Parameters
----------
top_n : int
    the `n` in "top-n"
from_pseudo_doc : bool
    .. versionadded:: 0.12.2

    If True, it returns the topic distribution of its pseudo document. Only valid for `tomotopy.models.PTModel`.'''
        return super().get_topics(top_n, from_pseudo_doc)

현재 문헌의 상위 top_n개의 토픽과 그 확률을 tuple의 list 형태로 반환합니다.

파라미터

top_n : int
"상위-n"에서 n의 값
from_pseudo_doc : bool

추가된 버전: 0.12.2

참일 경우 가상 문헌의 토픽 분포를 반환합니다. PTModel에서만 유효합니다.

def get_words(self, top_n=10) ‑> List[Tuple[int, float]]
Expand source code
    def get_words(self, top_n=10) -> List[Tuple[int, float]]:
        '''.. versionadded:: 0.4.2

Return the `top_n` words with their probabilities for the document.'''
        return super().get_words(top_n)

추가된 버전: 0.4.2

현재 문헌의 상위 top_n개의 단어와 그 확률을 tuple의 list 형태로 반환합니다.

class SimpleTokenizer (stemmer=None,
pattern: str = None,
lowercase=True,
ngram_list: List[str] | None = None,
ngram_delimiter: str = '_')
Expand source code
class SimpleTokenizer:
    '''`SimpleTokenizer` provides a simple word-tokenizing utility with an arbitrary stemmer.'''
    def __init__(self, 
                 stemmer = None, 
                 pattern:Optional[str] = None, 
                 lowercase = True, 
                 ngram_list:Optional[List[str]] = None,
                 ngram_delimiter:str = '_',
                 ):
        '''Parameters
----------
stemmer : Callable[str, str]
    a callable object for stemming words. If this value is set to `None`, words are not stemmed.
pattern : str
    a regex pattern for extracting tokens
lowercase : bool
    converts the token into lowercase if this is True
ngram_list : Optional[List[str]]
    a list of multi-word phrases (e.g. `"new york"`). Occurrences of a phrase in the input
    are merged into a single token whose words are joined by `ngram_delimiter`.
ngram_delimiter : str
    the delimiter used to join the words of a matched n-gram into one token

Here is an example of using SimpleTokenizer with NLTK for stemming.

.. include:: ./auto_labeling_code_with_porter.rst
'''
        # Default pattern: runs of characters that are not whitespace or common punctuation.
        self._pat = re.compile(pattern or r"""[^\s.,;:'"?!<>(){}\[\]\\/`~@#$%^&*|]+""")
        if stemmer and not callable(stemmer):
            raise ValueError("`stemmer` must be callable.")
        self._stemmer = stemmer or None
        self._lowercase = lowercase
        self._ngram_pat = None
        self._ngram_delimiter = ngram_delimiter
        if ngram_list:
            self.build_ngram_pat(ngram_list)

    def build_ngram_pat(self, ngram_list:List[str]):
        '''Precompile the n-gram matcher.

Each distinct (stemmed, optionally lowercased) word of the given phrases is mapped to a
unique character starting at code point 256, so every phrase becomes a short string of
those characters; a single regex over this encoding finds phrase occurrences in one pass.
Phrases with fewer than two words are ignored. If no usable phrase remains, the matcher
is left unset.'''
        ngram_vocab = {}
        patterns = []

        for ngram in ngram_list:
            if self._lowercase:
                ngram = ngram.lower()
            words = self._pat.findall(ngram)
            if len(words) < 2:
                # a single word cannot form an n-gram
                continue
            chrs = []
            for word in words:
                if self._stemmer is not None:
                    word = self._stemmer(word)
                try:
                    wid = ngram_vocab[word]
                except KeyError:
                    # assign the next free character id (>= 256 to avoid clashing
                    # with the ' ' placeholder used for non-vocab words)
                    wid = chr(len(ngram_vocab) + 256)
                    ngram_vocab[word] = wid
                chrs.append(wid)
            patterns.append(''.join(chrs))
        
        if patterns:
            # longest alternatives first, so the regex prefers the longest phrase
            self._ngram_pat = re.compile('|'.join(sorted(patterns, key=lambda x: len(x), reverse=True)))
            self._ngram_vocab = ngram_vocab

    def _tokenize(self, raw:str):
        '''Yield (word, start, length) triples for `raw`, merging n-grams if configured.'''
        if self._ngram_pat is None:
            # simple path: emit each regex match, normalized
            for g in self._pat.finditer(raw):
                start, end = g.span()
                word = g.group()
                if self._lowercase: 
                    word = word.lower()
                if self._stemmer is not None:
                    word = self._stemmer(word)
                yield word, start, end - start
        else:
            # n-gram path: first tokenize everything, encoding each word as the
            # single character assigned in build_ngram_pat (' ' if not in vocab),
            # then run the phrase regex over the encoded string.
            all_words = []
            all_spans = []
            chrs = []
            for g in self._pat.finditer(raw):
                all_spans.append(g.span())
                word = g.group()
                if self._lowercase: 
                    word = word.lower()
                if self._stemmer is not None:
                    word = self._stemmer(word)
                all_words.append(word)
                try:
                    chrs.append(self._ngram_vocab[word])
                except KeyError:
                    chrs.append(' ')
            chrs = ''.join(chrs)
            for g in self._ngram_pat.finditer(chrs):
                s, e = g.span()
                # only merge when every gap between consecutive words is pure
                # whitespace (NOTE(review): adjacent words with an empty or
                # punctuation gap are intentionally not merged)
                is_space = all(raw[ns:ne].isspace() for (_, ns), (ne, _) in zip(all_spans[s:e-1], all_spans[s+1:e]))
                if not is_space:
                    continue
                # collapse the matched words into one token; mark the rest consumed
                all_words[s] = self._ngram_delimiter.join(all_words[s:e])
                all_words[s+1:e] = [None] * (e - s - 1)
                all_spans[s] = (all_spans[s][0], all_spans[e-1][1])

            for (s, e), word in zip(all_spans, all_words):
                if word is None: continue
                yield word, s, e - s

    def __call__(self, raw:str, user_data=None):
        '''Tokenize `raw`. If `raw` is an iterable of (raw string, user data) pairs
(and `user_data` is None), yield one token list per pair; otherwise yield
(word, start, length) triples for the single string.'''
        is_iterable = False
        # test raw is iterable
        if user_data is None and not isinstance(raw, str):
            try:
                iter(raw)
                is_iterable = True
            except TypeError:
                pass
        if is_iterable:
            # batch mode: each item is expected to be a (raw, user_data) pair
            for r, _ in raw:
                yield list(self._tokenize(r))
        else:
            yield from self._tokenize(raw)

SimpleTokenizer는 임의의 스테머를 사용할 수 있는 단순한 단어 분리 유틸리티입니다.

파라미터

stemmer : Callable[str, str]
단어를 스테밍하는데 사용되는 호출가능한 객체. 만약 이 값이 None이라면 스테밍은 사용되지 않습니다.
pattern : str
토큰을 추출하는데 사용할 정규식 패턴
lowercase : bool
참일 경우 분리된 단어들을 소문자화합니다.

SimpleTokenizer와 NLTK를 사용하여 스테밍을 하는 예제는 다음과 같습니다.

::

import tomotopy as tp

# This code requires nltk package for stemming.
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

stemmer = PorterStemmer()
stopwords = set(stopwords.words('english'))
corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem), 
    stopwords=lambda x: len(x) <= 2 or x in stopwords)
# data_feeder yields a tuple of (raw string, user data) or a str (raw string)
corpus.process(open(input_file, encoding='utf-8'))

# make LDA model and train
mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
mdl.train(0)
print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
print('Removed top words:', mdl.removed_top_words)
for i in range(0, 1000, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

# extract candidates for auto topic labeling
extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
cands = extractor.extract(mdl)

labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
for k in range(mdl.k):
    print("== Topic #{} ==".format(k))
    print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
    for word, prob in mdl.get_topic_words(k, top_n=10):
        print(word, prob, sep='\t')
    print()

# Example of Results
# -----------------
# == Topic #13 ==
# Labels: weapon systems, weaponry, anti-aircraft, towed, long-range
# aircraft        0.020458335056900978
# use     0.019993379712104797
# airlin  0.012523100711405277
# car     0.012058146297931671
# vehicl  0.01165518444031477
# carrier 0.011531196534633636
# tank    0.011221226304769516
# design  0.010694277472794056
# audi    0.010322313755750656
# martin  0.009981346316635609
# 
# == Topic #17 ==
# Labels: American baseball player, American baseball, American actress, singer-songwriter and guitarist, American actor, director, producer, and screenwriter
# american        0.04471408948302269
# english 0.01746685802936554
# player  0.01714528724551201
# politician      0.014698212035000324
# footbal 0.012313882820308208
# author  0.010909952223300934
# actor   0.008949155919253826
# french  0.007647186517715454
# academ  0.0073020863346755505
# produc  0.006815808825194836
#

파라미터

stemmer : Callable[str, str]
a callable object for stemming words. If this value is set to None, words are not stemmed.
pattern : str
a regex pattern for extracting tokens
lowercase : bool
converts the token into lowercase if this is True

Here is an example of using SimpleTokenizer with NLTK for stemming.

::

import tomotopy as tp

# This code requires nltk package for stemming.
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

stemmer = PorterStemmer()
stopwords = set(stopwords.words('english'))
corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem), 
    stopwords=lambda x: len(x) <= 2 or x in stopwords)
# data_feeder yields a tuple of (raw string, user data) or a str (raw string)
corpus.process(open(input_file, encoding='utf-8'))

# make LDA model and train
mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
mdl.train(0)
print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
print('Removed top words:', mdl.removed_top_words)
for i in range(0, 1000, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

# extract candidates for auto topic labeling
extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
cands = extractor.extract(mdl)

labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
for k in range(mdl.k):
    print("== Topic #{} ==".format(k))
    print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
    for word, prob in mdl.get_topic_words(k, top_n=10):
        print(word, prob, sep='\t')
    print()

# Example of Results
# -----------------
# == Topic #13 ==
# Labels: weapon systems, weaponry, anti-aircraft, towed, long-range
# aircraft        0.020458335056900978
# use     0.019993379712104797
# airlin  0.012523100711405277
# car     0.012058146297931671
# vehicl  0.01165518444031477
# carrier 0.011531196534633636
# tank    0.011221226304769516
# design  0.010694277472794056
# audi    0.010322313755750656
# martin  0.009981346316635609
# 
# == Topic #17 ==
# Labels: American baseball player, American baseball, American actress, singer-songwriter and guitarist, American actor, director, producer, and screenwriter
# american        0.04471408948302269
# english 0.01746685802936554
# player  0.01714528724551201
# politician      0.014698212035000324
# footbal 0.012313882820308208
# author  0.010909952223300934
# actor   0.008949155919253826
# french  0.007647186517715454
# academ  0.0073020863346755505
# produc  0.006815808825194836
#

메소드

def build_ngram_pat(self, ngram_list: List[str])
Expand source code
def build_ngram_pat(self, ngram_list:List[str]):
    '''Precompile the n-gram matcher from a list of multi-word phrases.

Every distinct (stemmed, optionally lowercased) word is assigned a unique character
starting at code point 256; each phrase then becomes a short string of those characters,
and one regex over that encoding detects phrase occurrences. Phrases with fewer than two
words are skipped; if nothing usable remains, the matcher is left unset.'''
    vocab = {}
    encoded = []

    for phrase in ngram_list:
        if self._lowercase:
            phrase = phrase.lower()
        tokens = self._pat.findall(phrase)
        if len(tokens) < 2:
            # a single word cannot form an n-gram
            continue
        code = []
        for tok in tokens:
            if self._stemmer is not None:
                tok = self._stemmer(tok)
            if tok not in vocab:
                # next free character id; >= 256 keeps clear of the ' ' placeholder
                vocab[tok] = chr(len(vocab) + 256)
            code.append(vocab[tok])
        encoded.append(''.join(code))

    if encoded:
        # longest alternatives first so the regex prefers the longest phrase
        encoded.sort(key=len, reverse=True)
        self._ngram_pat = re.compile('|'.join(encoded))
        self._ngram_vocab = vocab