diff --git a/docs/cooking_machine/config_parser.html b/docs/cooking_machine/config_parser.html index 84df31b..b56155d 100644 --- a/docs/cooking_machine/config_parser.html +++ b/docs/cooking_machine/config_parser.html @@ -3,16 +3,18 @@ - + Codestin Search App - - - - + + + + + +
@@ -24,8 +26,8 @@

Module topicnet.cooking_machine.config_parser

Parsing text file into Experiment instance using strictyaml (github.com/crdoconnor/strictyaml/)

The aim here is to make config: -* possible to use even for non-programmers -* hard to misuse + possible to use even for non-programmers + hard to misuse * easy debuggable

Hence, the process of parsing config is a bit more complicated than it could be, but it produces more useful error messages. For example:

@@ -1028,7 +1030,7 @@

Returns

-def build_schema_from_function(func: Callable) -> dict +def build_schema_from_function(func: Callable) ‑> dict
@@ -1562,9 +1564,7 @@

Index

- - \ No newline at end of file diff --git a/docs/cooking_machine/cubes/base_cube.html b/docs/cooking_machine/cubes/base_cube.html index 9c2787a..340bac3 100644 --- a/docs/cooking_machine/cubes/base_cube.html +++ b/docs/cooking_machine/cubes/base_cube.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
@@ -291,8 +293,8 @@

Module topicnet.cooking_machine.cubes.base_cubeParameters

# TODO: будет странно работать, если бесконечный список parameter_description = self.get_jsonable_from_parameters() cube_description = { - 'action': self.action, - 'params': parameter_description + 'action': self.action, + 'params': parameter_description } # at one level only one cube can be implemented @@ -918,9 +920,7 @@

-

Generated by pdoc 0.8.1.

+

Generated by pdoc 0.9.0.

- - \ No newline at end of file diff --git a/docs/cooking_machine/cubes/controller_cube.html b/docs/cooking_machine/cubes/controller_cube.html index 4488146..75167fb 100644 --- a/docs/cooking_machine/cubes/controller_cube.html +++ b/docs/cooking_machine/cubes/controller_cube.html @@ -3,16 +3,18 @@ - + Codestin Search App - - - - + + + + + +
@@ -833,7 +835,7 @@

Methods

-def is_out_of_control(self, values: List[float]) -> OutOfControlAnswer +def is_out_of_control(self, values: List[float]) ‑> OutOfControlAnswer
@@ -849,7 +851,7 @@

Methods

class ControllerAgent -(reg_name: str, tau_converter: Callable, max_iters: int, score_to_track: Union[str, List[str], NoneType] = None, fraction_threshold: Union[float, List[float], NoneType] = None, score_controller: Union[topicnet.cooking_machine.cubes.controller_cube.BaseScoreController, List[topicnet.cooking_machine.cubes.controller_cube.BaseScoreController], NoneType] = None, local_dict: dict = None) +(reg_name: str, tau_converter: Callable, max_iters: int, score_to_track: Union[str, List[str], NoneType] = None, fraction_threshold: Union[float, List[float], NoneType] = None, score_controller: Union[BaseScoreController, List[BaseScoreController], NoneType] = None, local_dict: dict = None)

Allows to change tau during the _fit method.

@@ -1154,7 +1156,7 @@

Parameters

(answer: bool, error_message: Union[str, NoneType] = None)
-

OutOfControlAnswer(answer:bool, error_message:Union[str, NoneType]=None)

+

OutOfControlAnswer(answer: bool, error_message: Union[str, NoneType] = None)

Expand source code @@ -1165,6 +1167,10 @@

Parameters

Class variables

+
var answer : bool
+
+
+
var error_message : Union[str, NoneType]
@@ -1656,6 +1662,7 @@

OutOfControlAnswer

@@ -1678,9 +1685,7 @@

-

Generated by pdoc 0.8.1.

+

Generated by pdoc 0.9.0.

- - \ No newline at end of file diff --git a/docs/cooking_machine/cubes/cube_creator.html b/docs/cooking_machine/cubes/cube_creator.html index 609589c..578372c 100644 --- a/docs/cooking_machine/cubes/cube_creator.html +++ b/docs/cooking_machine/cubes/cube_creator.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
@@ -648,9 +650,7 @@

- - \ No newline at end of file diff --git a/docs/cooking_machine/cubes/greedy_strategy.html b/docs/cooking_machine/cubes/greedy_strategy.html index 903ef72..b9dc1c5 100644 --- a/docs/cooking_machine/cubes/greedy_strategy.html +++ b/docs/cooking_machine/cubes/greedy_strategy.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
@@ -518,9 +520,7 @@

-

Generated by pdoc 0.8.1.

+

Generated by pdoc 0.9.0.

- - \ No newline at end of file diff --git a/docs/cooking_machine/cubes/index.html b/docs/cooking_machine/cubes/index.html index 8adfcd7..25622ad 100644 --- a/docs/cooking_machine/cubes/index.html +++ b/docs/cooking_machine/cubes/index.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
@@ -19,6 +21,72 @@

Module topicnet.cooking_machine.cubes

+

Cubes and their Strategies

+

Cube types:

+
    +
  • BaseCube — a parent class for all the Cubes
  • +
  • RegularizersModifierCube — cube that adds or alters +model regularizers
  • +
  • CubeCreator — cube that allows to change model +fundamental hyperparameters (topic number)
  • +
  • RegularizationControllerCube - cube that ties together +a complicated usage of RegularizersModifierCube. This cube +allows for change of regularization coefficients across the model +training. This allows to obtain somewhat unique results by combining +contradictory restrictions on the model.
  • +
+
+

Strategy types:

+
    +
  • BaseStrategy — a parent class for all the +Strategies
  • +
  • PerplexityStrategy — performs search in given +hyperparameter space until certain score exceeds a boundary
  • +
  • GreedyStrategy — strategy that performs search in +hyperparameter space consecutively changing dimensions to perform a 1D +search for a minimum
  • +
+
+

Cube internal structure

+

The main cube attributes:

+
    +
  • parameters — parameters is an iterable object +containing all the specific information about current cube. The class +architecture implies that parameters should contain an iterable field +describing the hyperparameters search space
  • +
+

Cube methods worth noticing:

+
    +
  • __call__ — performs the cube actions to the model +using provided dataset. Always receives instance of TopicModel class and +instance of Dataset class. This method does the internal workings of +training models with new hyperparameters. It is responsible for logging +the events (which parameters were changed) happening during the model +training.

  • +
  • apply — method of the cube that prepares model for +further training. This method should be specified by the user as it +contains an “essence” of what is happening at this stage of the +training. It could be new type of model reinitialization, change of the +regularization coefficient, adding a new level of hierarchy etc. This +function defines what the cube does in the training pipeline.

  • +
  • get_jsonable_from_parameters — is a cube-specific +function that transforms its parameters to dict-like form which later is +written in JSON format log of the experiment.

  • +
+
+

What do you need to +create a new cube?

+

Following these 3 easy steps you will be able to write down your own +cube:

+
    +
  1. Inherit your Cube from BaseCube.

  2. +
  3. Child class should define following methods +__init__, apply, +get_jsonable_from_parameters. It is strongly discouraged to +change __call__ method.

  4. +
  5. get_jsonable_from_parameters()[i] corresponds to the +same cube step as parameters[i].

  6. +
@@ -75,7 +143,7 @@

Sub-modules

- - \ No newline at end of file diff --git a/docs/cooking_machine/cubes/perplexity_strategy.html b/docs/cooking_machine/cubes/perplexity_strategy.html index 5e10a12..d9b1c26 100644 --- a/docs/cooking_machine/cubes/perplexity_strategy.html +++ b/docs/cooking_machine/cubes/perplexity_strategy.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
@@ -669,9 +671,7 @@

-

Generated by pdoc 0.8.1.

+

Generated by pdoc 0.9.0.

- - \ No newline at end of file diff --git a/docs/cooking_machine/cubes/regularizer_cube.html b/docs/cooking_machine/cubes/regularizer_cube.html index 30a94fa..773d350 100644 --- a/docs/cooking_machine/cubes/regularizer_cube.html +++ b/docs/cooking_machine/cubes/regularizer_cube.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
@@ -600,9 +602,7 @@

-

Generated by pdoc 0.8.1.

+

Generated by pdoc 0.9.0.

- - \ No newline at end of file diff --git a/docs/cooking_machine/cubes/strategy.html b/docs/cooking_machine/cubes/strategy.html index 884f293..69ef6f5 100644 --- a/docs/cooking_machine/cubes/strategy.html +++ b/docs/cooking_machine/cubes/strategy.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
@@ -399,9 +401,7 @@

-

Generated by pdoc 0.8.1.

+

Generated by pdoc 0.9.0.

- - \ No newline at end of file diff --git a/docs/cooking_machine/dataset.html b/docs/cooking_machine/dataset.html index 23b26cf..0aace40 100644 --- a/docs/cooking_machine/dataset.html +++ b/docs/cooking_machine/dataset.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
@@ -29,6 +31,7 @@

Module topicnet.cooking_machine.dataset

import pandas as pd import shutil import sys +import uuid import warnings from glob import glob @@ -36,6 +39,8 @@

Module topicnet.cooking_machine.dataset

List, Optional, ) +from collections import Counter + import artm @@ -51,6 +56,8 @@

Module topicnet.cooking_machine.dataset

DEFAULT_ARTM_MODALITY = '@default_class' # TODO: how to get this value from artm library? MODALITY_START_SYMBOL = '|' +NONEXISTENT_SEP = str(uuid.uuid4()) # To read vw as one-column csv + def _increase_csv_field_max_size(): """Makes document entry in dataset as big as possible @@ -123,6 +130,20 @@

Module topicnet.cooking_machine.dataset

return "" +def dataset2counter(dataset): + result = {} + for i, row in dataset._data.iterrows(): + doc_id, *text_info = row['vw_text'].split('|@') + doc_id = doc_id.strip() + result[doc_id] = Counter() + # TODO: use get_content_of_modalty here + vw_line = text_info[0] + for token_with_counter in vw_line.split()[1:]: + token, _, counter = token_with_counter.partition(':') + result[doc_id][token] += int(counter or '1') + return result + + class BaseDataset: """ """ def get_source_document(self, document_id): @@ -364,7 +385,7 @@

Module topicnet.cooking_machine.dataset

data = data_handle.read_csv( data_path, engine='python', - error_bad_lines=False, + on_bad_lines='warn', ) elif file_type == '.pkl': @@ -372,7 +393,7 @@

Module topicnet.cooking_machine.dataset

data = data_handle.read_pickle( data_path, engine='python', - error_bad_lines=False, + on_bad_lines='warn', ) except AttributeError: raise RuntimeError('Can\'t handle big *.pkl files!') @@ -381,8 +402,8 @@

Module topicnet.cooking_machine.dataset

data = data_handle.read_csv( data_path, engine='python', - error_bad_lines=False, - sep='\n', + on_bad_lines='warn', + sep=NONEXISTENT_SEP, header=None, names=[VW_TEXT_COL] ) @@ -424,7 +445,7 @@

Module topicnet.cooking_machine.dataset

Another Parameters ------------------ **kwargs - *kwargs* are optional init `topicnet.Dataset` parameters + *kwargs* are optional init parameters """ data_path = os.path.join(save_dataset_path, dataframe_name + '.csv') dataframe.to_csv(data_path) @@ -741,6 +762,29 @@

Module topicnet.cooking_machine.dataset

Functions

+
+def dataset2counter(dataset) +
+
+
+
+ +Expand source code + +
def dataset2counter(dataset):
+    result = {}
+    for i, row in dataset._data.iterrows():
+        doc_id, *text_info = row['vw_text'].split('|@')
+        doc_id = doc_id.strip()
+        result[doc_id] = Counter()
+        # TODO: use get_content_of_modalty here
+        vw_line = text_info[0]
+        for token_with_counter in vw_line.split()[1:]:
+            token, _, counter = token_with_counter.partition(':')
+            result[doc_id][token] += int(counter or '1')
+    return result
+
+
def get_modality_names(vw_string)
@@ -1202,7 +1246,7 @@

Notes

data = data_handle.read_csv( data_path, engine='python', - error_bad_lines=False, + on_bad_lines='warn', ) elif file_type == '.pkl': @@ -1210,7 +1254,7 @@

Notes

data = data_handle.read_pickle( data_path, engine='python', - error_bad_lines=False, + on_bad_lines='warn', ) except AttributeError: raise RuntimeError('Can\'t handle big *.pkl files!') @@ -1219,8 +1263,8 @@

Notes

data = data_handle.read_csv( data_path, engine='python', - error_bad_lines=False, - sep='\n', + on_bad_lines='warn', + sep=NONEXISTENT_SEP, header=None, names=[VW_TEXT_COL] ) @@ -1262,7 +1306,7 @@

Notes

Another Parameters ------------------ **kwargs - *kwargs* are optional init `topicnet.Dataset` parameters + *kwargs* are optional init parameters """ data_path = os.path.join(save_dataset_path, dataframe_name + '.csv') dataframe.to_csv(data_path) @@ -1582,7 +1626,7 @@

Subclasses

Static methods

-def from_dataframe(dataframe: pandas.core.frame.DataFrame, save_dataset_path: str, dataframe_name: str = 'dataset', **kwargs) -> Dataset +def from_dataframe(dataframe: pandas.core.frame.DataFrame, save_dataset_path: str, dataframe_name: str = 'dataset', **kwargs) ‑> Dataset

Creates dataset from pd.DataFrame @@ -1599,7 +1643,7 @@

Parameters

Another Parameters


kwargs -kwargs are optional init topicnet.Dataset parameters

+kwargs are optional init parameters

Expand source code @@ -1627,7 +1671,7 @@

Parameters

Another Parameters ------------------ **kwargs - *kwargs* are optional init `topicnet.Dataset` parameters + *kwargs* are optional init parameters """ data_path = os.path.join(save_dataset_path, dataframe_name + '.csv') dataframe.to_csv(data_path) @@ -1695,7 +1739,7 @@

Methods

-def get_batch_vectorizer(self) -> artm.batches_utils.BatchVectorizer +def get_batch_vectorizer(self) ‑> artm.batches_utils.BatchVectorizer

Gets batch vectorizer.

@@ -1766,7 +1810,7 @@

Returns

-def get_dictionary(self) -> artm.dictionary.Dictionary +def get_dictionary(self) ‑> artm.dictionary.Dictionary

Gets dataset's dictionary.

@@ -1844,7 +1888,7 @@

Returns

-def get_source_document(self, document_id: str) -> pandas.core.frame.DataFrame +def get_source_document(self, document_id: str) ‑> pandas.core.frame.DataFrame

Get 'raw_text' for the document with document_id.

@@ -1909,7 +1953,7 @@

Returns

-def get_vw_document(self, document_id: str) -> pandas.core.frame.DataFrame +def get_vw_document(self, document_id: str) ‑> pandas.core.frame.DataFrame

Get 'vw_text' for the document with document_id.

@@ -1974,7 +2018,7 @@

Returns

-def write_vw(self, file_path: str) -> NoneType +def write_vw(self, file_path: str) ‑> NoneType

Saves dataset as text file in Vowpal Wabbit format

@@ -2030,6 +2074,7 @@

Index

  • Functions

    @@ -2064,9 +2109,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/dataset_cooc.html b/docs/cooking_machine/dataset_cooc.html index 0003134..20e1686 100644 --- a/docs/cooking_machine/dataset_cooc.html +++ b/docs/cooking_machine/dataset_cooc.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -761,9 +763,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/experiment.html b/docs/cooking_machine/experiment.html index 298e57d..bd318fa 100644 --- a/docs/cooking_machine/experiment.html +++ b/docs/cooking_machine/experiment.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -2862,9 +2864,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/index.html b/docs/cooking_machine/index.html index 0ca01e5..2428b0d 100644 --- a/docs/cooking_machine/index.html +++ b/docs/cooking_machine/index.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -19,6 +21,42 @@

    Module topicnet.cooking_machine

    +

    Cooking Machine

    +

    Cube

    +

    A unit of model training pipeline. This unit encapsulates an action +over one or many model hyperparameters. This action and hyperparameter +space are stored as cube properties and then saved in Experiment.

    +

    Input: model or list of models, regularizer or list +of them, hyperparameter search space (grid), iterations number or a +function defining it, custom metrics.
    +Output: models.
    +Body: performs actions over artm model. +Can modify, create new models and alter their Experiment.

    +

    Model

    +

    A class containing Topic Model and its description:

    +
      +
    • stores topic model description;
    • +
    • outputs the description in human-readable form;
    • +
    • the model can only load and copy itself, the artm-model is an +attribute and in order to change it, it should be extracted, modified and +put back;
    • +
    • stores experiment id;
    • +
    • stores parent model id;
    • +
    • stores model topic names;
    • +
    • stores regularizers list with their parameters;
    • +
    • stores modality weights;
    • +
    • stores save path for data, model and model information;
    • +
    • stores training metric values.
    • +
    +

    Experiment

    +

    Class providing experiment infrastructure:

    +
      +
    • keeps the description of all actions on the models;
    • +
    • provides human-readable log of experiment;
    • +
    • keeps the model training sequence in memory;
    • +
    • automatically runs integrity check;
    • +
    • able to copy itself.
    • +
    @@ -93,7 +131,7 @@

    Sub-modules

    - - \ No newline at end of file diff --git a/docs/cooking_machine/model_constructor.html b/docs/cooking_machine/model_constructor.html index 5699f8b..eb64305 100644 --- a/docs/cooking_machine/model_constructor.html +++ b/docs/cooking_machine/model_constructor.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -246,7 +248,7 @@

    Module topicnet.cooking_machine.model_constructor

    Functions

    -def add_standard_scores(model: artm.artm_model.ARTM, dictionary: artm.dictionary.Dictionary = None, main_modality: str = '@lemmatized', all_modalities: List[str] = ('@lemmatized', '@ngramms')) -> NoneType +def add_standard_scores(model: artm.artm_model.ARTM, dictionary: artm.dictionary.Dictionary = None, main_modality: str = '@lemmatized', all_modalities: List[str] = ('@lemmatized', '@ngramms')) ‑> NoneType

    Adds standard scores for the model.

    @@ -409,7 +411,7 @@

    Returns

    -def init_simple_default_model(dataset: Dataset, modalities_to_use: List[str], main_modality: str, specific_topics: List[str], background_topics: List[str]) -> artm.artm_model.ARTM +def init_simple_default_model(dataset: Dataset, modalities_to_use: List[str], main_modality: str, specific_topics: List[str], background_topics: List[str]) ‑> artm.artm_model.ARTM

    Creates simple artm.ARTM model with standard scores.

    @@ -554,9 +556,7 @@

    Index

    - - \ No newline at end of file diff --git a/docs/cooking_machine/model_tracking.html b/docs/cooking_machine/model_tracking.html index eb64232..2469ddb 100644 --- a/docs/cooking_machine/model_tracking.html +++ b/docs/cooking_machine/model_tracking.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -945,9 +947,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/base_model.html b/docs/cooking_machine/models/base_model.html index 9467dc1..319a4c7 100644 --- a/docs/cooking_machine/models/base_model.html +++ b/docs/cooking_machine/models/base_model.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -106,7 +108,7 @@

    Module topicnet.cooking_machine.models.base_model else: experiment_id = None - return f'Model(id={self.model_id}, ' \ + return f'{self.__class__.__name__}(id={self.model_id}, ' \ f'parent_id={self.parent_model_id}, ' \ f'experiment_id={experiment_id}' \ f')' @@ -266,16 +268,20 @@

    Module topicnet.cooking_machine.models.base_model def _get_short_scores(self): short_scores = {} + # sometimes self.scores could be None for score_name in self.scores or {}: values = self.scores[score_name] + if len(values) == 0: short_scores[score_name] = [] continue - if isinstance(values[0], Number): - short_scores[score_name] = values[-1:] - else: - short_scores[score_name] = [f"NaN ({type(values[0])})"] + + short_scores[score_name] = [ + v if isinstance(v, Number) else f"NaN ({type(v)})" + for v in values + ] + return short_scores @property @@ -452,7 +458,7 @@

    Parameters

    else: experiment_id = None - return f'Model(id={self.model_id}, ' \ + return f'{self.__class__.__name__}(id={self.model_id}, ' \ f'parent_id={self.parent_model_id}, ' \ f'experiment_id={experiment_id}' \ f')' @@ -612,16 +618,20 @@

    Parameters

    def _get_short_scores(self): short_scores = {} + # sometimes self.scores could be None for score_name in self.scores or {}: values = self.scores[score_name] + if len(values) == 0: short_scores[score_name] = [] continue - if isinstance(values[0], Number): - short_scores[score_name] = values[-1:] - else: - short_scores[score_name] = [f"NaN ({type(values[0])})"] + + short_scores[score_name] = [ + v if isinstance(v, Number) else f"NaN ({type(v)})" + for v in values + ] + return short_scores @property @@ -1085,9 +1095,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/base_regularizer.html b/docs/cooking_machine/models/base_regularizer.html index 2283130..0bb09c0 100644 --- a/docs/cooking_machine/models/base_regularizer.html +++ b/docs/cooking_machine/models/base_regularizer.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -166,9 +168,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/base_score.html b/docs/cooking_machine/models/base_score.html index ebdb7b3..e2988ab 100644 --- a/docs/cooking_machine/models/base_score.html +++ b/docs/cooking_machine/models/base_score.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -280,18 +282,18 @@

    Examples

    Scores created below are unworkable (as BaseScore has no call method implemented). These are just the examples of how one can create a score and set some of its parameters.

    Scores to be computed on every iteration:

    -
    >>> score = BaseScore()
    +
    >>> score = BaseScore()
     >>> score = BaseScore(should_compute=BaseScore.compute_always)
     >>> score = BaseScore(should_compute=lambda i: True)
     >>> score = BaseScore(should_compute=True)
     

    Scores to be computed only on the last iteration:

    -
    >>> score = BaseScore(should_compute=BaseScore.compute_on_last)
    +
    >>> score = BaseScore(should_compute=BaseScore.compute_on_last)
     >>> score = BaseScore(should_compute=lambda i: False)
     >>> score = BaseScore(should_compute=False)
     

    Score to be computed only on even iterations:

    -
    >>> score = BaseScore(should_compute=lambda i: i % 2 == 0)
    +
    >>> score = BaseScore(should_compute=lambda i: i % 2 == 0)
     
    @@ -504,7 +506,7 @@

    Subclasses

    Static methods

    -def compute_always(fit_iteration: int) -> bool +def compute_always(fit_iteration: int) ‑> bool
    @@ -518,7 +520,7 @@

    Static methods

  • -def compute_on_last(fit_iteration: int) -> bool +def compute_on_last(fit_iteration: int) ‑> bool
    @@ -583,7 +585,7 @@

    Notes

    in its successor score classes.

    Examples

    Score which uses precomputed_data:

    -
    >>> import time
    +
    >>> import time
     ...
     >>> class NewScore(BaseScore):
     ...     def __init__(self, name: str, multiplier: float):
    @@ -771,9 +773,7 @@ 

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/blei_lafferty_score.html b/docs/cooking_machine/models/blei_lafferty_score.html index b2c1dde..69a9c66 100644 --- a/docs/cooking_machine/models/blei_lafferty_score.html +++ b/docs/cooking_machine/models/blei_lafferty_score.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -83,7 +85,7 @@

    Module topicnet.cooking_machine.models.blei_lafferty_sco blei_eps = 1e-42 log_phi = np.log(phi + blei_eps) numerator = np.sum(log_phi, axis=1) - numerator = numerator[:, np.newaxis] + numerator = numerator.to_numpy()[:, np.newaxis] if hasattr(log_phi, "values"): multiplier = log_phi.values - numerator / topic_number @@ -191,7 +193,7 @@

    Parameters

    blei_eps = 1e-42 log_phi = np.log(phi + blei_eps) numerator = np.sum(log_phi, axis=1) - numerator = numerator[:, np.newaxis] + numerator = numerator.to_numpy()[:, np.newaxis] if hasattr(log_phi, "values"): multiplier = log_phi.values - numerator / topic_number @@ -254,9 +256,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/dummy_topic_model.html b/docs/cooking_machine/models/dummy_topic_model.html index 41aedb1..1a04130 100644 --- a/docs/cooking_machine/models/dummy_topic_model.html +++ b/docs/cooking_machine/models/dummy_topic_model.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -874,9 +876,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/example_score.html b/docs/cooking_machine/models/example_score.html index 22f2653..adbd1d5 100644 --- a/docs/cooking_machine/models/example_score.html +++ b/docs/cooking_machine/models/example_score.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -236,9 +238,7 @@

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/frozen_score.html b/docs/cooking_machine/models/frozen_score.html index 2d424ba..b89c1ab 100644 --- a/docs/cooking_machine/models/frozen_score.html +++ b/docs/cooking_machine/models/frozen_score.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -109,7 +111,9 @@

    Module topicnet.cooking_machine.models.frozen_score

    + pass + + self._name = self._original_score._name
    @@ -163,18 +167,18 @@

    Examples

    Scores created below are unworkable (as BaseScore has no call method implemented). These are just the examples of how one can create a score and set some of its parameters.

    Scores to be computed on every iteration:

    -
    >>> score = BaseScore()
    +
    >>> score = BaseScore()
     >>> score = BaseScore(should_compute=BaseScore.compute_always)
     >>> score = BaseScore(should_compute=lambda i: True)
     >>> score = BaseScore(should_compute=True)
     

    Scores to be computed only on the last iteration:

    -
    >>> score = BaseScore(should_compute=BaseScore.compute_on_last)
    +
    >>> score = BaseScore(should_compute=BaseScore.compute_on_last)
     >>> score = BaseScore(should_compute=lambda i: False)
     >>> score = BaseScore(should_compute=False)
     

    Score to be computed only on even iterations:

    -
    >>> score = BaseScore(should_compute=lambda i: i % 2 == 0)
    +
    >>> score = BaseScore(should_compute=lambda i: i % 2 == 0)
     
    @@ -253,7 +257,9 @@

    Examples

    setattr(self._original_score, field_name, field_value) except AttributeError: # TODO: log? - pass
    + pass + + self._name = self._original_score._name

    Ancestors

      @@ -262,7 +268,7 @@

      Ancestors

      Methods

      -def update(self, score_value: float) -> NoneType +def update(self, score_value: float) ‑> NoneType

      Update is not supposed to be applied to Frozen score. @@ -335,9 +341,7 @@

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/index.html b/docs/cooking_machine/models/index.html index 6d87e37..974d285 100644 --- a/docs/cooking_machine/models/index.html +++ b/docs/cooking_machine/models/index.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -19,6 +21,91 @@

    Module topicnet.cooking_machine.models

    +

    Models and scores

    +

    Available models:

    +
      +
    • BaseModel — Parent class for model creation
    • +
    • TopicModel — a wrapper class for bigartm topic +model
    • +
    • DummyTopicModel — a fake model that contains training +information but not actual artm model. Needed to save memory space +during the training.
    • +
    +
    +

    Available scores:

    +
      +
    • BaseScore — a parent class for all the Scores
    • +
    • ExampleScore — a minimal working example of a +custom score
    • +
    • IntratextCoherenceScore — score that calculates +coherence as a measure of interpretability of the model using raw +documents from dataset. Calculation-heavy score. Recommended to be used +after model training
    • +
    • BleiLaffertyScore — An experimental light-weight score +to estimate interpretability of the topics
    • +
    • SemanticRadiusScore — An experimental score reflecting +whether collection is adequately described by topics. Lower is better. +Calculation-heavy score.
    • +
    +
    +

    Internal model structure

    +

    main model attributes:

    +
      +
    • model_id — a model string id, unique for its +Experiment.

    • +
    • scores — dict of lists, each list corresponds to the +score value or list of values at certain training stage.

    • +
    • custom_scores — variable providing custom scores for +the model

    • +
    • custom_regularizers — variable providing custom +regularizers for the model. An example is provided in +topic_prior_regularizer.py.

    • +
    +

    main model methods:

    +
      +
    • _fit — function performing model training. Takes the +dataset and number of iterations. Optionally, you can pass +custom_regularizers here, if you wish to apply them to a +single iteration.

      +

      Important Notice! We assume that the model training +happens through Cube interface and this method, while important, should +never be used by users if they hope to have their actions +logged

    • +
    • get_phi — function that returns +p(token|topic/cluster) probability distributions that returns +pandas.DataFrame with tokens as index and topics/clusters as columns

      +

      Important Notice! Strictly speaking the function +returns degree to which token belongs to the topic/cluster and shouldn’t +be a probability distribution. But since its main use-case intended for +topic models some of the functions using this method might work +incorrectly in non-distribution case

    • +
    • get_theta — function that returns +p(topic/cluster|document) probability distributions that returns +pandas.DataFrame with topics/clusters as index and document ids as +columns.

      +

      Important Notice! Strictly speaking the function +returns degree to which document belongs to the topic/cluster and +shouldn’t be a probability distribution. But since its main use-case +intended for topic models some of the functions using this method might +work incorrectly in non-distribution case

    • +
    • save — saves model to the path directory.

    • +
    • load — loads model from the path directory

    • +
    • clone — creates copy of a model.

    • +
    • get_jsonable_from_parameters — turns model +parameters to jsonable format for logging purposes

    • +
    +
    +

    What do you need to +create your own model?

    +

    Following these steps you should be able to code a model integrated +with the library methods:

    +
      +
    1. New model class is inherited from BaseModel

    2. +
    3. A child class should contain methods __init__, +_fit, get_phi, get_theta, +save, load, clone, +get_jsonable_from_parameters.

    4. +
    @@ -106,7 +193,7 @@

    Sub-modules

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/intratext_coherence_score.html b/docs/cooking_machine/models/intratext_coherence_score.html index 2c0e902..b957162 100644 --- a/docs/cooking_machine/models/intratext_coherence_score.html +++ b/docs/cooking_machine/models/intratext_coherence_score.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -806,7 +808,7 @@

    Class variables

    class IntratextCoherenceScore -(dataset: Union[topicnet.cooking_machine.dataset.Dataset, str], name: str = None, should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, documents_fraction: float = 1.0, text_type: TextType = TextType.VW_TEXT, computation_method: ComputationMethod = ComputationMethod.SEGMENT_WEIGHT, word_topic_relatedness: WordTopicRelatednessType = WordTopicRelatednessType.PWT, specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, max_num_out_of_topic_words: int = 10, window: int = 20, start_fit_iteration: int = 0, fit_iteration_step: int = 1, seed: int = 11221963, verbose: bool = False) +(dataset: Union[Dataset, str], name: str = None, should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, documents_fraction: float = 1.0, text_type: TextType = TextType.VW_TEXT, computation_method: ComputationMethod = ComputationMethod.SEGMENT_WEIGHT, word_topic_relatedness: WordTopicRelatednessType = WordTopicRelatednessType.PWT, specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, max_num_out_of_topic_words: int = 10, window: int = 20, start_fit_iteration: int = 0, fit_iteration_step: int = 1, seed: int = 11221963, verbose: bool = False)

    Computes intratext coherence

    @@ -872,7 +874,7 @@

    Notes

    at the end of the training process (and not in the dependence of score on iteration), one should adjust start_fit_iteration and fit_iteration_step correspondingly. For example:

    -
    >>> # dataset = Dataset(...)
    +
    >>> # dataset = Dataset(...)
     >>> # topic_model = TopicModel(...)
     >>> num_iterations = 100
     >>> topic_model.custom_scores['intratext_coherence'] = IntratextCoherenceScore(
    @@ -1568,7 +1570,7 @@ 

    Instance variables

    Methods

    -def compute(self, model: BaseModel, topics: List[str] = None, documents: List[str] = None) -> Dict[str, Union[float, NoneType]] +def compute(self, model: BaseModel, topics: List[str] = None, documents: List[str] = None) ‑> Dict[str, Union[float, NoneType]]
    @@ -1639,7 +1641,7 @@

    Methods

    -def save(self, path: str) -> NoneType +def save(self, path: str) ‑> NoneType
    @@ -1874,9 +1876,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/scores.html b/docs/cooking_machine/models/scores.html index 016e55a..22268a1 100644 --- a/docs/cooking_machine/models/scores.html +++ b/docs/cooking_machine/models/scores.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -115,7 +117,7 @@

    Parameters

    blei_eps = 1e-42 log_phi = np.log(phi + blei_eps) numerator = np.sum(log_phi, axis=1) - numerator = numerator[:, np.newaxis] + numerator = numerator.to_numpy()[:, np.newaxis] if hasattr(log_phi, "values"): multiplier = log_phi.values - numerator / topic_number @@ -155,7 +157,7 @@

    Inherited members

    class IntratextCoherenceScore -(dataset: Union[topicnet.cooking_machine.dataset.Dataset, str], name: str = None, should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, documents_fraction: float = 1.0, text_type: TextType = TextType.VW_TEXT, computation_method: ComputationMethod = ComputationMethod.SEGMENT_WEIGHT, word_topic_relatedness: WordTopicRelatednessType = WordTopicRelatednessType.PWT, specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, max_num_out_of_topic_words: int = 10, window: int = 20, start_fit_iteration: int = 0, fit_iteration_step: int = 1, seed: int = 11221963, verbose: bool = False) +(dataset: Union[Dataset, str], name: str = None, should_compute: Callable[[int], bool] = None, keep_dataset_in_memory: bool = None, keep_dataset: bool = True, documents: List[str] = None, documents_fraction: float = 1.0, text_type: TextType = TextType.VW_TEXT, computation_method: ComputationMethod = ComputationMethod.SEGMENT_WEIGHT, word_topic_relatedness: WordTopicRelatednessType = WordTopicRelatednessType.PWT, specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, max_num_out_of_topic_words: int = 10, window: int = 20, start_fit_iteration: int = 0, fit_iteration_step: int = 1, seed: int = 11221963, verbose: bool = False)

    Computes intratext coherence

    @@ -221,7 +223,7 @@

    Notes

    at the end of the training process (and not in the dependence of score on iteration), one should adjust start_fit_iteration and fit_iteration_step correspondingly. For example:

    -
    >>> # dataset = Dataset(...)
    +
    >>> # dataset = Dataset(...)
     >>> # topic_model = TopicModel(...)
     >>> num_iterations = 100
     >>> topic_model.custom_scores['intratext_coherence'] = IntratextCoherenceScore(
    @@ -917,7 +919,7 @@ 

    Instance variables

    Methods

    -def compute(self, model: BaseModel, topics: List[str] = None, documents: List[str] = None) -> Dict[str, Union[float, NoneType]] +def compute(self, model: BaseModel, topics: List[str] = None, documents: List[str] = None) ‑> Dict[str, Union[float, NoneType]]
    @@ -988,7 +990,7 @@

    Methods

    -def save(self, path: str) -> NoneType +def save(self, path: str) ‑> NoneType
    @@ -1352,9 +1354,7 @@

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/scores_wrapper.html b/docs/cooking_machine/models/scores_wrapper.html index bb0418f..d2a89e6 100644 --- a/docs/cooking_machine/models/scores_wrapper.html +++ b/docs/cooking_machine/models/scores_wrapper.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -26,8 +28,9 @@

    Module topicnet.cooking_machine.models.scores_wrapper
    import artm
     import copy
    -from collections.abc import Mapping
    +import warnings
     
    +from collections.abc import Mapping
     from typing import (
         Dict,
         List,
    @@ -78,15 +81,20 @@ 

    Module topicnet.cooking_machine.models.scores_wrapperClasses

    class ScoresWrapper -(topicnet_scores: Dict[str, topicnet.cooking_machine.models.base_score.BaseScore], artm_scores: artm.scores.Scores) +(topicnet_scores: Dict[str, BaseScore], artm_scores: artm.scores.Scores)
    @@ -167,15 +175,20 @@

    Classes

    return key def add(self, score: Union[BaseScore, artm.scores.BaseScore]): - if isinstance(score, FrozenScore): - raise TypeError('FrozenScore is not supposed to be added to model') + if isinstance(score, BaseScore): + if isinstance(score, FrozenScore): + warnings.warn( + f'Adding FrozenScore "{score._name}" to model.' + f' It will not be used in computations!' + f' If this is not the expected behaviour,' + f' then perhaps the fact is that the score was not saved correctly.' + ) - elif isinstance(score, BaseScore): if score._name is None: raise ValueError( 'When using `model.scores.add(score)` method,' - ' one should specify score name parameter during score initialization.' - ' For example `model.scores.add(IntratextCoherenceScore(name="name", ...))' + ' one should specify score `name` parameter during score initialization.' + ' For example, `model.scores.add(IntratextCoherenceScore(name="name", ...))' ) self._topicnet_scores[score._name] = score @@ -208,7 +221,7 @@

    Ancestors

    Methods

    -def add(self, score: Union[topicnet.cooking_machine.models.base_score.BaseScore, artm.scores.BaseScore]) +def add(self, score: Union[BaseScore, artm.scores.BaseScore])
    @@ -217,15 +230,20 @@

    Methods

    Expand source code
    def add(self, score: Union[BaseScore, artm.scores.BaseScore]):
    -    if isinstance(score, FrozenScore):
    -        raise TypeError('FrozenScore is not supposed to be added to model')
    +    if isinstance(score, BaseScore):
    +        if isinstance(score, FrozenScore):
    +            warnings.warn(
    +                f'Adding FrozenScore "{score._name}" to model.'
    +                f' It will not be used in computations!'
    +                f' If this is not the expected behaviour,'
    +                f' then perhaps the fact is that the score was not saved correctly.'
    +            )
     
    -    elif isinstance(score, BaseScore):
             if score._name is None:
                 raise ValueError(
                     'When using `model.scores.add(score)` method,'
    -                ' one should specify score name parameter during score initialization.'
    -                ' For example `model.scores.add(IntratextCoherenceScore(name="name", ...))'
    +                ' one should specify score `name` parameter during score initialization.'
    +                ' For example, `model.scores.add(IntratextCoherenceScore(name="name", ...))'
                 )
     
             self._topicnet_scores[score._name] = score
    @@ -272,9 +290,7 @@ 

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/semantic_radius_score.html b/docs/cooking_machine/models/semantic_radius_score.html index e4a14ba..8f3069b 100644 --- a/docs/cooking_machine/models/semantic_radius_score.html +++ b/docs/cooking_machine/models/semantic_radius_score.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -654,9 +656,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/thetaless_regularizer.html b/docs/cooking_machine/models/thetaless_regularizer.html index 9288ffd..7e6f924 100644 --- a/docs/cooking_machine/models/thetaless_regularizer.html +++ b/docs/cooking_machine/models/thetaless_regularizer.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -53,8 +55,9 @@

    Module topicnet.cooking_machine.models.thetaless_regular """ dictionary_data = artm_dict._master.get_dictionary(artm_dict._name) - dict_pandas = {field: getattr(dictionary_data, field) + dict_pandas = {field: list(getattr(dictionary_data, field)) for field in FIELDS} + return pd.DataFrame(dict_pandas) # ================================== @@ -81,9 +84,9 @@

    Module topicnet.cooking_machine.models.thetaless_regular return df_inverted_index.to_dict()['index'] -def dataset2sparse_matrix(dataset, modality, modalities_to_use=None): +def dataset2sparse_matrix(dataset, modality, modalities_to_use=None, remove_nans=True): """ - Builds a sparse matrix from batch_vectorizer linked to the Dataset + Builds a sparse matrix from batch_vectorizer linked to the Dataset. If you need an inverse mapping: @@ -95,7 +98,7 @@

    Module topicnet.cooking_machine.models.thetaless_regular dataset: Dataset modality: str the remaining modalities will be ignored - (their occurrences will be replaced with zeros, but they will continue to exist) + (their occurrences will be replaced with zeros, but they will continue to exist). modalities_to_use: iterable a set of modalities the underlying topic model is using (this is about topic model, not regularizer; this parameter ensures that the shapes of n_dw matrix and actual @@ -110,27 +113,30 @@

    Module topicnet.cooking_machine.models.thetaless_regular If you hadn't explicitly listed any modalities yet, you probably could leave this argument as None. - If you use a single modality, wrap it into a list (e.g.['@word']) + If you use a single modality, wrap it into a list (e.g.['@word']). + remove_nans: bool + whether to re-encode values to transform NaNs in n_dw matrix to explicitly stored zeros. Returns ------- - n_dw_matrix: scipy.sparse.csr_matrix - The matrix of document-word occurrences. - `n_dw` is a number of the occurrences of the word `w` in the document `d` - this matrix determines the dependence between the Theta and Phi matrices - (Phi is the result of one iteration of the ARTM's EM algorihtm - with uniform theta initialization and `n_dw` matrix of the document-word occurrences) + n_dw_matrix: scipy.sparse.csr_matrix + the matrix of document-word occurrences + (`n_dw` is a number of the occurrences of the word `w` in the document `d`.) + This matrix determines the dependence between the Theta and Phi matrices + (Phi is the result of one iteration of the ARTM's EM algorihtm + with uniform Theta initialization and `n_dw` matrix of the document-word occurrences). + """ # noqa: W291 token2id = obtain_token2id(dataset) batch_vectorizer = dataset.get_batch_vectorizer() return _batch_vectorizer2sparse_matrix( - batch_vectorizer, token2id, modality, modalities_to_use + batch_vectorizer, token2id, modality, modalities_to_use, remove_nans ) -def _batch_vectorizer2sparse_matrix(batch_vectorizer, token2id, modality, modalities_to_use=None): +def _batch_vectorizer2sparse_matrix(batch_vectorizer, token2id, modality, modalities_to_use=None, remove_nans=True): """ """ theta_column_naming = 'id' # scipy sparse matrix doesn't support non-integer indices @@ -153,6 +159,7 @@

    Module topicnet.cooking_machine.models.thetaless_regular # probably dictionary was filtered continue if modalities_to_use and token_class_id not in modalities_to_use: + # skip foreign modality continue if token_class_id != modality: # we still need these tokens, @@ -173,11 +180,12 @@

    Module topicnet.cooking_machine.models.thetaless_regular # this is needed to be in sync with artm dictionary after filtering elements out # (they need to have the same shape) ind = sparse_n_dw_matrix.sum(axis=0) - nonzeros = np.ravel(ind > 0) + nonzeros = np.ravel((ind > 0) | (ind != ind)) # also includes NaN-s sparse_n_dw_matrix = sparse_n_dw_matrix[:, nonzeros] - # re-encode values to transform NaNs to explicitly stored zeros - sparse_n_dw_matrix.data = np.nan_to_num(sparse_n_dw_matrix.data) + if remove_nans: + # re-encode values to transform NaNs to explicitly stored zeros + sparse_n_dw_matrix.data = np.nan_to_num(sparse_n_dw_matrix.data) return sparse_n_dw_matrix @@ -288,7 +296,7 @@

    Module topicnet.cooking_machine.models.thetaless_regular class ThetalessRegularizer(BaseRegularizer): - def __init__(self, name, tau, modality, dataset: Dataset): + def __init__(self, name, tau, modality, dataset: Dataset, modalities_to_use=None): """ A regularizer based on a "thetaless" topic model inference @@ -298,20 +306,36 @@

    Module topicnet.cooking_machine.models.thetaless_regular Parameters ---------- name: str - name of the regularizer + name of the regularizer. tau: Number according to the math, `tau` should be set to 1 (to correctly emulate a different inference process). But you do you, it's not like there's a regularizer police or something. modality: str - name of modality on which the inference should be based - dataset - will be transformed to n_dw_matrix + name of modality on which the inference should be based. + dataset: Dataset + will be transformed to n_dw_matrix. + modalities_to_use: iterable + a set of modalities the underlying topic model is using (this is about topic model, + not regularizer; this parameter ensures that the shapes of n_dw matrix and actual + Phi matrix match). + + The tokens outside of this list will be discarded utterly + (the resulting matrix will have no entries corresponding to them) + + For artm.ARTM() models, you need to pass whatever is inside class_ids; + while TopicModel usually requires this to be set inside modalities_to_use. + + If you hadn't explicitly listed any modalities yet, you probably could + leave this argument as None. + + If you use a single modality, wrap it into a list (e.g.['@word']). + """ # noqa: W291 super().__init__(name, tau) self.modality = modality - self.modalities_to_use = None + self.modalities_to_use = modalities_to_use self.n_dw_matrix = None self.token2id = obtain_token2id(dataset) @@ -319,8 +343,14 @@

    Module topicnet.cooking_machine.models.thetaless_regular def _initialize_matrices(self, batch_vectorizer, token2id): self.n_dw_matrix = _batch_vectorizer2sparse_matrix( - batch_vectorizer, token2id, self.modality, self.modalities_to_use + batch_vectorizer, token2id, + self.modality, self.modalities_to_use, + remove_nans=False, ) + ind = self.n_dw_matrix.sum(axis=0) + self.modalities_mask = np.ravel((ind == ind)) # detects not-NaN-s + self.n_dw_matrix.data = np.nan_to_num(self.n_dw_matrix.data) + self.B = scipy.sparse.csr_matrix( ( 1. * self.n_dw_matrix.data / calc_docsizes(self.n_dw_matrix), @@ -362,7 +392,10 @@

    Module topicnet.cooking_machine.models.thetaless_regular tmp = g_dt.T * self.B / (phi_matrix_tr.sum(axis=1) + EPS) n_tw += (tmp - np.einsum('ij,ji->i', phi_rev_matrix, tmp)) * phi_matrix - return self.tau * (n_tw.T - nwt) + result = n_tw.T - nwt + result = (result.T * self.modalities_mask).T + + return self.tau * result def attach(self, model): """ @@ -378,7 +411,9 @@

    Module topicnet.cooking_machine.models.thetaless_regular f" should be set to {1} to correctly emulate a thetaless inference process" ) - self.modalities_to_use = model.class_ids.keys() + if not self.modalities_to_use: + self.modalities_to_use = model.class_ids.keys() + bv = artm.BatchVectorizer(data_path=self._batches_path, data_format='batches') self._initialize_matrices(bv, self.token2id) @@ -413,8 +448,9 @@

    Functions

    """ dictionary_data = artm_dict._master.get_dictionary(artm_dict._name) - dict_pandas = {field: getattr(dictionary_data, field) + dict_pandas = {field: list(getattr(dictionary_data, field)) for field in FIELDS} + return pd.DataFrame(dict_pandas)

    @@ -467,12 +503,12 @@

    Functions

    -def dataset2sparse_matrix(dataset, modality, modalities_to_use=None) +def dataset2sparse_matrix(dataset, modality, modalities_to_use=None, remove_nans=True)
    -

    Builds a sparse matrix from batch_vectorizer linked to the Dataset

    +

    Builds a sparse matrix from batch_vectorizer linked to the Dataset.

    If you need an inverse mapping:

    -
    >>> d = sparse_n_dw_matrix.todok()  # convert to dictionary of keys format
    +
    >>> d = sparse_n_dw_matrix.todok()  # convert to dictionary of keys format
     >>> dict_of_csr = dict(d.items())
     

    Parameters

    @@ -481,7 +517,7 @@

    Parameters

     
    modality : str
    the remaining modalities will be ignored -(their occurrences will be replaced with zeros, but they will continue to exist)
    +(their occurrences will be replaced with zeros, but they will continue to exist).
    modalities_to_use : iterable

    a set of modalities the underlying topic model is using (this is about topic model, @@ -493,26 +529,27 @@

    Parameters

    while TopicModel usually requires this to be set inside modalities_to_use.

    If you hadn't explicitly listed any modalities yet, you probably could leave this argument as None.

    -

    If you use a single modality, wrap it into a list (e.g.['@word'])

    +

    If you use a single modality, wrap it into a list (e.g.['@word']).

    +
    remove_nans : bool
    +
    whether to re-encode values to transform NaNs in n_dw matrix to explicitly stored zeros.

    Returns

    -
    n_dw_matrix : scipy.sparse.csr_matrix -
    -
    The matrix of document-word occurrences.
    -n_dw is a number of the occurrences of the word w in the document d
    -this matrix determines the dependence between the Theta and Phi matrices
    -(Phi is the result of one iteration of the ARTM's EM algorihtm
    -with uniform theta initialization and n_dw matrix of the document-word occurrences)
    +
    n_dw_matrix : scipy.sparse.csr_matrix
    +
    the matrix of document-word occurrences +(n_dw is the number of occurrences of the word w in the document d.) +This matrix determines the dependence between the Theta and Phi matrices +(Phi is the result of one iteration of the ARTM's EM algorithm +with uniform Theta initialization and n_dw matrix of the document-word occurrences).
    Expand source code -
    def dataset2sparse_matrix(dataset, modality, modalities_to_use=None):
    +
    def dataset2sparse_matrix(dataset, modality, modalities_to_use=None, remove_nans=True):
         """
    -    Builds a sparse matrix from batch_vectorizer linked to the Dataset
    +    Builds a sparse matrix from batch_vectorizer linked to the Dataset.
     
         If you need an inverse mapping:
     
    @@ -524,7 +561,7 @@ 

    Returns

    dataset: Dataset modality: str the remaining modalities will be ignored - (their occurrences will be replaced with zeros, but they will continue to exist) + (their occurrences will be replaced with zeros, but they will continue to exist). modalities_to_use: iterable a set of modalities the underlying topic model is using (this is about topic model, not regularizer; this parameter ensures that the shapes of n_dw matrix and actual @@ -539,23 +576,26 @@

    Returns

    If you hadn't explicitly listed any modalities yet, you probably could leave this argument as None. - If you use a single modality, wrap it into a list (e.g.['@word']) + If you use a single modality, wrap it into a list (e.g.['@word']). + remove_nans: bool + whether to re-encode values to transform NaNs in n_dw matrix to explicitly stored zeros. Returns ------- - n_dw_matrix: scipy.sparse.csr_matrix - The matrix of document-word occurrences. - `n_dw` is a number of the occurrences of the word `w` in the document `d` - this matrix determines the dependence between the Theta and Phi matrices - (Phi is the result of one iteration of the ARTM's EM algorihtm - with uniform theta initialization and `n_dw` matrix of the document-word occurrences) + n_dw_matrix: scipy.sparse.csr_matrix + the matrix of document-word occurrences + (`n_dw` is a number of the occurrences of the word `w` in the document `d`.) + This matrix determines the dependence between the Theta and Phi matrices + (Phi is the result of one iteration of the ARTM's EM algorihtm + with uniform Theta initialization and `n_dw` matrix of the document-word occurrences). + """ # noqa: W291 token2id = obtain_token2id(dataset) batch_vectorizer = dataset.get_batch_vectorizer() return _batch_vectorizer2sparse_matrix( - batch_vectorizer, token2id, modality, modalities_to_use + batch_vectorizer, token2id, modality, modalities_to_use, remove_nans )
    @@ -718,7 +758,7 @@

    Classes

    class ThetalessRegularizer -(name, tau, modality, dataset: Dataset) +(name, tau, modality, dataset: Dataset, modalities_to_use=None)

    Base regularizer class to construct custom regularizers.

    @@ -728,22 +768,35 @@

    Classes

    Parameters

    name : str
    -
    name of the regularizer
    +
    name of the regularizer.
    tau : Number
    according to the math, tau should be set to 1 (to correctly emulate a different
    inference process). But you do you, it's not like there's a regularizer
    police or something.
    modality : str
    -
    name of modality on which the inference should be based
    -
    dataset
    -
    will be transformed to n_dw_matrix
    +
    name of modality on which the inference should be based.
    +
    dataset : Dataset
    +
    will be transformed to n_dw_matrix.
    +
    modalities_to_use : iterable
    +
    +

    a set of modalities the underlying topic model is using (this is about topic model, +not regularizer; this parameter ensures that the shapes of n_dw matrix and actual +Phi matrix match).

    +

    The tokens outside of this list will be discarded utterly +(the resulting matrix will have no entries corresponding to them)

    +

    For artm.ARTM() models, you need to pass whatever is inside class_ids; +while TopicModel usually requires this to be set inside modalities_to_use.

    +

    If you hadn't explicitly listed any modalities yet, you probably could +leave this argument as None.

    +

    If you use a single modality, wrap it into a list (e.g.['@word']).

    +
    Expand source code
    class ThetalessRegularizer(BaseRegularizer):
    -    def __init__(self, name, tau, modality, dataset: Dataset):
    +    def __init__(self, name, tau, modality, dataset: Dataset, modalities_to_use=None):
             """
             A regularizer based on a "thetaless" topic model inference
     
    @@ -753,20 +806,36 @@ 

    Parameters

    Parameters ---------- name: str - name of the regularizer + name of the regularizer. tau: Number according to the math, `tau` should be set to 1 (to correctly emulate a different inference process). But you do you, it's not like there's a regularizer police or something. modality: str - name of modality on which the inference should be based - dataset - will be transformed to n_dw_matrix + name of modality on which the inference should be based. + dataset: Dataset + will be transformed to n_dw_matrix. + modalities_to_use: iterable + a set of modalities the underlying topic model is using (this is about topic model, + not regularizer; this parameter ensures that the shapes of n_dw matrix and actual + Phi matrix match). + + The tokens outside of this list will be discarded utterly + (the resulting matrix will have no entries corresponding to them) + + For artm.ARTM() models, you need to pass whatever is inside class_ids; + while TopicModel usually requires this to be set inside modalities_to_use. + + If you hadn't explicitly listed any modalities yet, you probably could + leave this argument as None. + + If you use a single modality, wrap it into a list (e.g.['@word']). + """ # noqa: W291 super().__init__(name, tau) self.modality = modality - self.modalities_to_use = None + self.modalities_to_use = modalities_to_use self.n_dw_matrix = None self.token2id = obtain_token2id(dataset) @@ -774,8 +843,14 @@

    Parameters

    def _initialize_matrices(self, batch_vectorizer, token2id): self.n_dw_matrix = _batch_vectorizer2sparse_matrix( - batch_vectorizer, token2id, self.modality, self.modalities_to_use + batch_vectorizer, token2id, + self.modality, self.modalities_to_use, + remove_nans=False, ) + ind = self.n_dw_matrix.sum(axis=0) + self.modalities_mask = np.ravel((ind == ind)) # detects not-NaN-s + self.n_dw_matrix.data = np.nan_to_num(self.n_dw_matrix.data) + self.B = scipy.sparse.csr_matrix( ( 1. * self.n_dw_matrix.data / calc_docsizes(self.n_dw_matrix), @@ -817,7 +892,10 @@

    Parameters

    tmp = g_dt.T * self.B / (phi_matrix_tr.sum(axis=1) + EPS) n_tw += (tmp - np.einsum('ij,ji->i', phi_rev_matrix, tmp)) * phi_matrix - return self.tau * (n_tw.T - nwt) + result = n_tw.T - nwt + result = (result.T * self.modalities_mask).T + + return self.tau * result def attach(self, model): """ @@ -833,7 +911,9 @@

    Parameters

    f" should be set to {1} to correctly emulate a thetaless inference process" ) - self.modalities_to_use = model.class_ids.keys() + if not self.modalities_to_use: + self.modalities_to_use = model.class_ids.keys() + bv = artm.BatchVectorizer(data_path=self._batches_path, data_format='batches') self._initialize_matrices(bv, self.token2id) @@ -884,7 +964,10 @@

    Methods

    tmp = g_dt.T * self.B / (phi_matrix_tr.sum(axis=1) + EPS) n_tw += (tmp - np.einsum('ij,ji->i', phi_rev_matrix, tmp)) * phi_matrix - return self.tau * (n_tw.T - nwt)
    + result = n_tw.T - nwt + result = (result.T * self.modalities_mask).T + + return self.tau * result
    @@ -937,9 +1020,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/topic_model.html b/docs/cooking_machine/models/topic_model.html index 8d92d6b..dc52b10 100644 --- a/docs/cooking_machine/models/topic_model.html +++ b/docs/cooking_machine/models/topic_model.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -243,7 +245,7 @@

    Module topicnet.cooking_machine.models.topic_modelModule topicnet.cooking_machine.models.topic_modelModule topicnet.cooking_machine.models.topic_modelModule topicnet.cooking_machine.models.topic_modelClasses

    class TopicModel -(artm_model: artm.artm_model.ARTM = None, model_id: str = None, parent_model_id: str = None, data_path: str = None, description: List[Dict[str, Any]] = None, experiment=None, callbacks: List[ControllerAgent] = None, custom_scores: Dict[str, topicnet.cooking_machine.models.base_score.BaseScore] = None, custom_regularizers: Dict[str, topicnet.cooking_machine.models.base_regularizer.BaseRegularizer] = None, *args, **kwargs) +(artm_model: artm.artm_model.ARTM = None, model_id: str = None, parent_model_id: str = None, data_path: str = None, description: List[Dict[str, Any]] = None, experiment=None, callbacks: List[ControllerAgent] = None, custom_scores: Dict[str, BaseScore] = None, custom_regularizers: Dict[str, BaseRegularizer] = None, *args, **kwargs)

    Topic Model contains artm model and all necessary information: scores, training pipeline, etc.

    @@ -1139,7 +1143,7 @@

    Parameters

    for cur_iter in range(num_iterations): precomputed_data = dict() - iter_is_last = cur_iter == num_iterations - 1 + iter_is_last = (cur_iter == num_iterations - 1) self._model.fit_offline(batch_vectorizer=dataset_trainable, num_collection_passes=1) @@ -1172,8 +1176,10 @@

    Parameters

    custom_score.update(score) self._model.score_tracker[name] = custom_score - except AttributeError: # TODO: means no "call" attribute? - raise AttributeError(f'Score {name} doesn\'t have a desired attribute') + except AttributeError as error: # TODO: means no "call" attribute? + raise AttributeError( + f'Seems that score "{name}" doesn\'t have a desired attribute...' + ) from error # TODO: think about performance issues for callback_agent in self.callbacks: @@ -1208,7 +1214,7 @@

    Parameters

    regularizer_tau=base_regularizers_tau) (meta, nd_array) = self._model.master.attach_model(rwt_name) - attached_rwt = pd.DataFrame(data=nd_array, columns=meta.topic_name, index=meta.token) + attached_rwt = pd.DataFrame(data=nd_array, columns=list(meta.topic_name), index=list(meta.token)) for regularizer in custom_regularizers.values(): attached_rwt.values[:, :] += regularizer.grad(pwt, nwt) @@ -1346,8 +1352,8 @@

    Parameters

    score_object.save(save_path) except pickle.PicklingError: warnings.warn( - f'Failed to save custom score "{score_object}" correctly! ' - f'Freezing score (saving only its value)' + f'Failed to save custom score "{score_object}" correctly!' + f' Freezing score (saving only its value)' ) frozen_score_object = FrozenScore( @@ -2394,7 +2400,7 @@

    Returns

    -def get_regularizer(self, reg_name: str) -> Union[BaseRegularizer, artm.regularizers.BaseRegularizer] +def get_regularizer(self, reg_name: str) ‑> Union[BaseRegularizer, artm.regularizers.BaseRegularizer]

    Retrieves the specified regularizer, no matter whether it is custom or "classic"

    @@ -2547,7 +2553,7 @@

    Notes

    but there is no ARTM model inside! (so model.get_phi() won't work!) If one wants to use the topic model as before, this ARTM model should be restored first:

    -
    >>> save_path = topic_model.model_default_save_path
    +
    >>> save_path = topic_model.model_default_save_path
     >>> topic_model._model = artm.load_artm_model(f'{save_path}/model')
     
    @@ -2673,8 +2679,8 @@

    Parameters

    score_object.save(save_path) except pickle.PicklingError: warnings.warn( - f'Failed to save custom score "{score_object}" correctly! ' - f'Freezing score (saving only its value)' + f'Failed to save custom score "{score_object}" correctly!' + f' Freezing score (saving only its value)' ) frozen_score_object = FrozenScore( @@ -2892,9 +2898,7 @@

    - - \ No newline at end of file diff --git a/docs/cooking_machine/models/topic_prior_regularizer.html b/docs/cooking_machine/models/topic_prior_regularizer.html index 7f31094..03e1462 100644 --- a/docs/cooking_machine/models/topic_prior_regularizer.html +++ b/docs/cooking_machine/models/topic_prior_regularizer.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -417,9 +419,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/pretty_output.html b/docs/cooking_machine/pretty_output.html index 597d6aa..b8a9f3f 100644 --- a/docs/cooking_machine/pretty_output.html +++ b/docs/cooking_machine/pretty_output.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -310,7 +312,7 @@

    Module topicnet.cooking_machine.pretty_output

    Returns

    Expand source code
    def make_notebook_pretty():
    -    from IPython.core.display import display, HTML
    +    from IPython.display import display, HTML
     
         display(HTML("""<style>
         div.output_html {
    @@ -793,9 +795,7 @@ 

    Index

    - - \ No newline at end of file diff --git a/docs/cooking_machine/recipes/artm_baseline_pipeline.html b/docs/cooking_machine/recipes/artm_baseline_pipeline.html index 9c2ee47..ba0b5e7 100644 --- a/docs/cooking_machine/recipes/artm_baseline_pipeline.html +++ b/docs/cooking_machine/recipes/artm_baseline_pipeline.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -239,9 +241,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/recipes/exploratory_search_pipeline.html b/docs/cooking_machine/recipes/exploratory_search_pipeline.html index 28b8686..6b96e6a 100644 --- a/docs/cooking_machine/recipes/exploratory_search_pipeline.html +++ b/docs/cooking_machine/recipes/exploratory_search_pipeline.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -247,9 +249,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/recipes/index.html b/docs/cooking_machine/recipes/index.html index 603a2bc..99bf912 100644 --- a/docs/cooking_machine/recipes/index.html +++ b/docs/cooking_machine/recipes/index.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -19,6 +21,42 @@

    Module topicnet.cooking_machine.recipes

    +

    TopicNet Recipes

    +

    This module contains mechanisms to generate code for training topic +models on your data. It was created in order to simplify knowledge +transition about model training from the field researchers to the end +users and possibly for easier exchange of a code between the research +groups. As a backbone it uses snippets of YAML configs that require +filling in information about the collection and hyperparameters of the +required topic model. Currently it is recommended to import +BaselineRecipe, SearchRecipe, +MultimodalSearchRecipe classes for the experiment +environment generation. However, for the compatibility with previous +examples found in topicnet/demos/*-Recipe.ipynb notebooks +we also have ARTM-baseline and +exploratory_search configs in YAML format.

    +
    +
      +
    • BaselineRecipe - Class for generating a pipeline +training topic models with decorrelation regularization, maximizing +custom BleiLafferty score from TopicNet library +topicnet.cooking_machine.models.scores.BleiLaffertyScore.
    • +
    • SearchRecipe - a Class recreating training scenario +from exploratory_search YAML config. Provides good strategy +for training topic models for collection search properties. A link to +the publication can be found in the comments section of the recipe.
    • +
    • MultimodalSearchRecipe - a Class that modifies previous +strategy for the case of multimodal data allowing to recreate previous +scenario for each modality separately.
    • +
    • intratext_coherence_maximization.yml - a string in YAML +format (like the old recipes) allowing to build topic model with +decorrelation, Phi and Theta matrices Sparsing and Smoothing with +background topics maximizing the intratext coherence score +topicnet.cooking_machine.models.scores.IntratextCoherenceScore.
    • +
    • topic_number_search.yml - a recipe recreating published +strategy to find optimal topic number for given dataset. References to +the publication can be found in the config docstring.
    • +
    @@ -69,7 +107,7 @@

    Sub-modules

    - - \ No newline at end of file diff --git a/docs/cooking_machine/recipes/intratext_coherence_pipeline.html b/docs/cooking_machine/recipes/intratext_coherence_pipeline.html index 962db37..1d6b320 100644 --- a/docs/cooking_machine/recipes/intratext_coherence_pipeline.html +++ b/docs/cooking_machine/recipes/intratext_coherence_pipeline.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -341,7 +343,7 @@

    Ancestors

    Methods

    -def format_recipe(self, dataset_path: str, num_specific_topics: int, main_modality: str = None, dictionary_filter_parameters: dict = None, num_background_topics: int = 1, modalities: List[str] = None, keep_dataset_in_memory: bool = True, keep_dataset: bool = False, documents_fraction: float = 0.5, one_stage_num_iter: int = 20, verbose: bool = True) -> str +def format_recipe(self, dataset_path: str, num_specific_topics: int, main_modality: str = None, dictionary_filter_parameters: dict = None, num_background_topics: int = 1, modalities: List[str] = None, keep_dataset_in_memory: bool = True, keep_dataset: bool = False, documents_fraction: float = 0.5, one_stage_num_iter: int = 20, verbose: bool = True) ‑> str

    Parameters

    @@ -553,9 +555,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/recipes/multimodal_exploratory_search_pipeline.html b/docs/cooking_machine/recipes/multimodal_exploratory_search_pipeline.html index cf9953b..c59bc13 100644 --- a/docs/cooking_machine/recipes/multimodal_exploratory_search_pipeline.html +++ b/docs/cooking_machine/recipes/multimodal_exploratory_search_pipeline.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -649,9 +651,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/recipes/recipe_wrapper.html b/docs/cooking_machine/recipes/recipe_wrapper.html index b4173ef..6294e4b 100644 --- a/docs/cooking_machine/recipes/recipe_wrapper.html +++ b/docs/cooking_machine/recipes/recipe_wrapper.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -234,7 +236,7 @@

    Subclasses

    Methods

    -def build_experiment_environment(self, save_path: str, experiment_id: str = 'default_experiment_name', force_separate_thread: bool = False) -> Tuple[ExperimentDataset] +def build_experiment_environment(self, save_path: str, experiment_id: str = 'default_experiment_name', force_separate_thread: bool = False) ‑> Tuple[ExperimentDataset]

    Returns experiment and dataset instances @@ -289,7 +291,7 @@

    Parameters

    -def format_recipe(self, *args, **kwargs) -> str +def format_recipe(self, *args, **kwargs) ‑> str

    Updates self._recipe @@ -339,9 +341,7 @@

    - - \ No newline at end of file diff --git a/docs/cooking_machine/recipes/wntm.html b/docs/cooking_machine/recipes/wntm.html index 05bb7cb..47167cb 100644 --- a/docs/cooking_machine/recipes/wntm.html +++ b/docs/cooking_machine/recipes/wntm.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -254,7 +256,7 @@

    Ancestors

    Methods

    -def build_experiment_environment(self, save_path: str, experiment_id: str = 'default_experiment_name', force_separate_thread: bool = False) -> Tuple[ExperimentDataset] +def build_experiment_environment(self, save_path: str, experiment_id: str = 'default_experiment_name', force_separate_thread: bool = False) ‑> Tuple[ExperimentDataset]

    Returns experiment and dataset instances @@ -346,9 +348,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/cooking_machine/rel_toolbox_lite.html b/docs/cooking_machine/rel_toolbox_lite.html index 2a9d352..32f9b62 100644 --- a/docs/cooking_machine/rel_toolbox_lite.html +++ b/docs/cooking_machine/rel_toolbox_lite.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -25,11 +27,12 @@

    Module topicnet.cooking_machine.rel_toolbox_lite< Expand source code
    import os
    +import uuid
     
     
     def count_vocab_size(dictionary, modalities):
         # TODO: check tokens filtered by dict.filter()
    -    fname = 'tmp.txt'
    +    fname = str(uuid.uuid4()) + '.txt'  # Plain 'tmp.txt' may fail if several processes work with the same file
         try:
             dictionary.save_text(fname)
             modality_count = {name: 0 for name in modalities}
    @@ -334,7 +337,7 @@ 

    Functions

    def count_vocab_size(dictionary, modalities):
         # TODO: check tokens filtered by dict.filter()
    -    fname = 'tmp.txt'
    +    fname = str(uuid.uuid4()) + '.txt'  # Plain 'tmp.txt' may fail if several processes work with the same file
         try:
             dictionary.save_text(fname)
             modality_count = {name: 0 for name in modalities}
    @@ -593,9 +596,7 @@ 

    Index

    - - \ No newline at end of file diff --git a/docs/cooking_machine/routine.html b/docs/cooking_machine/routine.html index a624a2c..662acfb 100644 --- a/docs/cooking_machine/routine.html +++ b/docs/cooking_machine/routine.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -853,7 +855,7 @@

    Returns

    -def choose_value_for_models_num_and_check(models_num_as_parameter, models_num_from_query) -> int +def choose_value_for_models_num_and_check(models_num_as_parameter, models_num_from_query) ‑> int
    @@ -1013,7 +1015,7 @@

    Returns

    -def get_equal_lists(one_dict, min_len: int = 0, sep: str = ' ', sep_len='last') +def get_equal_lists(one_dict, min_len: int = 0, sep: str = ' ', sep_len='last')

    Transforms all lists to list with the same length, but not less than min_len. @@ -1064,7 +1066,7 @@

    Parameters

    -def get_equal_strings(strings, min_len: int = 0, sep: str = ' ') +def get_equal_strings(strings, min_len: int = 0, sep: str = ' ')

    Transforms all strings to strings with the same length, but not less than min_len. @@ -1692,9 +1694,7 @@

    Index

    - - \ No newline at end of file diff --git a/docs/dataset_manager/api.html b/docs/dataset_manager/api.html index 764a318..c40b7a2 100644 --- a/docs/dataset_manager/api.html +++ b/docs/dataset_manager/api.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -43,7 +45,7 @@

    Module topicnet.dataset_manager.api

    from ..cooking_machine.dataset import Dataset -_SERVER_URL = 'https://topicnet-datasets.machine-intelligence.ru' +_SERVER_URL = 'https://topicnet-datasets.mil-team.ru' _ARCHIVE_EXTENSION = '.gz' _DEFAULT_DATASET_FILE_EXTENSION = '.csv' @@ -191,7 +193,7 @@

    Module topicnet.dataset_manager.api

    Functions

    -def get_info() -> str +def get_info() ‑> str

    Gets info about all datasets.

    @@ -204,7 +206,7 @@

    Examples

    As the return value is MarkDown text, in Jupyter Notebook one may do the following to format the output information nicely

    -
    >>> from IPython.display import Markdown
    +
    >>> from IPython.display import Markdown
     ...
     >>> Markdown(get_info())
     
    @@ -239,7 +241,7 @@

    Examples

    -def load_dataset(dataset_name: str, **kwargs) -> Dataset +def load_dataset(dataset_name: str, **kwargs) ‑> Dataset

    Load dataset by dataset_name. @@ -371,9 +373,7 @@

    Index

    - - \ No newline at end of file diff --git a/docs/dataset_manager/index.html b/docs/dataset_manager/index.html index 3b81a4b..80a0215 100644 --- a/docs/dataset_manager/index.html +++ b/docs/dataset_manager/index.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -66,9 +68,7 @@

    Index

    - - \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index dc0efc1..6f10a53 100644 --- a/docs/index.html +++ b/docs/index.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -19,6 +21,39 @@

    Package topicnet

    +

    TopicNet

    +

    The library was created to assist in the task of building topic +models. It aims to automate away many routine tasks related to topic +model training, allowing a user to focus on the task at hand. Also, it +provides additional tools to construct advanced topic models. The +library consists of the following modules:

    +
      +
    • cooking_machine — provides tools to design a topic +model construction pipeline, or experiment with regularizers +fitting
    • +
    • viewers — provides information about the topic model in +an accessible format
    • +
    • demos — demo .ipynb notebooks
    • +
    • dataset_manager — gives opportunity to download +datasets for experiments
    • +
    • tests — provides a user with means to test library +functionality (contains some examples of intended library usage)
    • +
    +

    Project description

    +

    In TopicNet framework, advanced topic models are built using +Experiment class. An experiment consists of stages (that we call +“cubes”) which perform actions over the “models” which are objects of +the Experiment. The experiment instance of Experiment class contains all +the information about the experiment process and automatically updates +its log when a cube is applied to the last level models. It is worth +noting that the experiment is linear, meaning it does not support +multiple different cubes at the same stage of the experiment. If that +need arises one is recommended to create a new experiment with a new +cube on the last level. The experiment instance of Experiment class +contains all the information about the experiment process and +automatically updates its log when the cube is applied to the last level +models. Summarizing: the key entity Experiment is a sequence of cubes +that produce models on each stage of the experiment process

    @@ -32,7 +67,10 @@

    Package topicnet

    lib = artm.wrapper.LibArtm(logging_config=lc) -__pdoc__ = {"tests": False}
    +__pdoc__ = { + "embeddings": False, + "tests": False, +}
    @@ -60,7 +98,7 @@

    Sub-modules

    - - \ No newline at end of file diff --git a/docs/viewers/base_viewer.html b/docs/viewers/base_viewer.html index 8352072..b60acb4 100644 --- a/docs/viewers/base_viewer.html +++ b/docs/viewers/base_viewer.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -181,9 +183,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/viewers/document_cluster.html b/docs/viewers/document_cluster.html index f9d17dd..7765a70 100644 --- a/docs/viewers/document_cluster.html +++ b/docs/viewers/document_cluster.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -548,9 +550,7 @@

    - - \ No newline at end of file diff --git a/docs/viewers/index.html b/docs/viewers/index.html index 1051a57..5123558 100644 --- a/docs/viewers/index.html +++ b/docs/viewers/index.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -19,6 +21,103 @@

    Module topicnet.viewers

    +

    Viewers

    +

    Module viewers provides information from a topic model +allowing to estimate the model quality. Its advantage is in unified call +infrastructure to the topic model making the routine and tedious task of +extracting the information easy.

    +

    Currently module contains the following viewers:

    +

    base_viewer +(BaseViewer)

    +

    Module responsible for base infrastructure.

    +

    document_cluster +(DocumentClusterViewer)

    +

    Module which allows to visualize collection documents. May be slow +for large document collections as it uses TSNE algorithm from sklearn +library.

    +

    +

    +
        <img src="../docs/images/doc_cluster__plot.png" width="80%" alt/>
    +</div>
    +<em>
    +    Visualisation of reduced document embeddings colored according to their topic made by DocumentClusterViewer.
    +</em>
    +

    +

    spectrum +(TopicSpectrumViewer)

    +

    Module contains heuristics for solving TSP to arrange topics +minimizing total distance of the spectrum.

    +

    +

    +
        <img src="../docs/images/topic_spectrum__refined_view.png" width="80%" alt/>
    +</div>
    +<em>
    +    Each point on the plot represents some topic.
    +    The viewer helped to calculate such a route between topics when one topic is connected with similar one, and so on, forming a circle.
    +</em>
    +

    +

    top_documents_viewer +(TopDocumentsViewer)

    +

    Module with functions that work with dataset document +collections.

    +

    +

    +
        <img src="../docs/images/top_doc__view.png" width="80%" alt/>
    +</div>
    +<em>
    +    The viewer shows fragments of top documents corresponding to some topic.
    +</em>
    +

    +

    top_similar_documents_viewer +(TopSimilarDocumentsViewer)

    +

    Module containing class for finding similar document for a given one. +This viewer helps to estimate homogeneity of clusters given by the +model.

    +

    +

    +
        <img src="../docs/images/top_sim_doc__refined_view.png" width="80%" alt/>
    +</div>
    +<em>
    +    Some document from text collection (on top), and documents nearest to it given topic model.
    +    The viewer (currently) gives only document names as output, but the picture is not very difficult to be made.
    +</em>
    +

    +

    top_tokens_viewer +(TopTokensViewer)

    +

    Module with class for displaying the most relevant tokens in each +topic of the model.

    +

    +

    +
        <img src="../docs/images/top_tokens__view.png" width="80%" alt/>
    +</div>
    +<em>
    +    Output of the TopTokensViewer. Token score in the topic is calculated for every token, score function can be specified at the stage of a viewer initialization.
    +</em>
    +

    +

    topic_mapping +(TopicMapViewer)

    +

    Module allowing to compare topics between two different models +trained on the same collection.

    +

    +

    +
        <img src="../docs/images/topic_map__view.png" width="80%" alt/>
    +</div>
    +<em>
    +    The mapping between topics of two models (currently only topic names are displayed).
    +</em>
    +

    +

    Deprecated

    +
      +
    • initial_doc_to_topic_viewer — first edition of +TopDocumentsViewer

    • +
    • tokens_viewer - first edition of +TopTokensViewer

    • +
    @@ -84,7 +183,7 @@

    Sub-modules

    - - \ No newline at end of file diff --git a/docs/viewers/initial_doc_to_topic_viewer.html b/docs/viewers/initial_doc_to_topic_viewer.html index e071870..cae535d 100644 --- a/docs/viewers/initial_doc_to_topic_viewer.html +++ b/docs/viewers/initial_doc_to_topic_viewer.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -235,9 +237,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/viewers/spectrum.html b/docs/viewers/spectrum.html index ee5e06a..90d3fd4 100644 --- a/docs/viewers/spectrum.html +++ b/docs/viewers/spectrum.html @@ -3,17 +3,19 @@ - + Codestin Search App - - - - + + + + + +
    @@ -1354,9 +1356,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/viewers/top_documents_viewer.html b/docs/viewers/top_documents_viewer.html index 3977fb2..083e85d 100644 --- a/docs/viewers/top_documents_viewer.html +++ b/docs/viewers/top_documents_viewer.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -1007,9 +1009,7 @@

    - - \ No newline at end of file diff --git a/docs/viewers/top_similar_documents_viewer.html b/docs/viewers/top_similar_documents_viewer.html index bab1db1..939c889 100644 --- a/docs/viewers/top_similar_documents_viewer.html +++ b/docs/viewers/top_similar_documents_viewer.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -1303,9 +1305,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/viewers/top_tokens_viewer.html b/docs/viewers/top_tokens_viewer.html index 13ec518..e3f2970 100644 --- a/docs/viewers/top_tokens_viewer.html +++ b/docs/viewers/top_tokens_viewer.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -149,7 +151,7 @@

    Module topicnet.viewers.top_tokens_viewer

    ---------- phi : pd.Dataframe phi matrix of the model - p_t : np.array of float + p_t : pd.Series probability that a random token from the collection belongs to that topic Returns @@ -160,7 +162,7 @@

    Module topicnet.viewers.top_tokens_viewer

    """ # noqa: W291 - joint_pwt = p_t[:, np.newaxis] * phi.transpose() + joint_pwt = p_t.to_numpy()[:, np.newaxis] * phi.transpose() return joint_pwt @@ -198,7 +200,7 @@

    Module topicnet.viewers.top_tokens_viewer

    # denominator will have the same Index/Columns as them # TODO: check equality denominator = (np.sum(joint_pwt, axis=0) - joint_pwt) - multiplier = (1 - p_t)[:, np.newaxis] + multiplier = (1 - p_t).to_numpy()[:, np.newaxis] if hasattr(phi, "values"): numerator = phi.values.transpose() * multiplier else: @@ -213,6 +215,7 @@

    Module topicnet.viewers.top_tokens_viewer

    return target_values +# TODO: copy-paste from BleiLaffertyScore def compute_blei_scores(phi): """ Computes Blei score @@ -234,7 +237,7 @@

    Module topicnet.viewers.top_tokens_viewer

    blei_eps = 1e-42 log_phi = np.log(phi + blei_eps) denominator = np.sum(log_phi, axis=0) - denominator = denominator[np.newaxis, :] + denominator = denominator.to_numpy()[np.newaxis, :] if hasattr(log_phi, "values"): multiplier = log_phi.values - denominator / topic_number @@ -753,7 +756,7 @@

    Returns

    blei_eps = 1e-42 log_phi = np.log(phi + blei_eps) denominator = np.sum(log_phi, axis=0) - denominator = denominator[np.newaxis, :] + denominator = denominator.to_numpy()[np.newaxis, :] if hasattr(log_phi, "values"): multiplier = log_phi.values - denominator / topic_number @@ -875,7 +878,7 @@

    Parameters

    phi : pd.Dataframe
    phi matrix of the model
    -
    p_t : np.array of float
    +
    p_t : pd.Series
    probability that a random token from the collection belongs to that topic

    Returns

    @@ -899,7 +902,7 @@

    Returns

    ---------- phi : pd.Dataframe phi matrix of the model - p_t : np.array of float + p_t : pd.Series probability that a random token from the collection belongs to that topic Returns @@ -910,7 +913,7 @@

    Returns

    """ # noqa: W291 - joint_pwt = p_t[:, np.newaxis] * phi.transpose() + joint_pwt = p_t.to_numpy()[:, np.newaxis] * phi.transpose() return joint_pwt

    @@ -975,7 +978,7 @@

    Returns

    # denominator will have the same Index/Columns as them # TODO: check equality denominator = (np.sum(joint_pwt, axis=0) - joint_pwt) - multiplier = (1 - p_t)[:, np.newaxis] + multiplier = (1 - p_t).to_numpy()[:, np.newaxis] if hasattr(phi, "values"): numerator = phi.values.transpose() * multiplier else: @@ -1145,7 +1148,7 @@

    Returns

    array of original indexes for top_values array (Default value = True)

    Examples

    -
    >>> values = np.array([1, 3, 2, 0.1, 5, 0])
    +
    >>> values = np.array([1, 3, 2, 0.1, 5, 0])
     >>> min_sum = 8.1
     >>> top_values, top_indexes = get_top_values_by_sum(values, min_sum)
     Result: top_values, top_indexes = (array([5., 3., 2.]), array([4, 1, 2]))
    @@ -1646,7 +1649,7 @@ 

    Instance variables

    Methods

    -def to_df(self, topic_names: Iterator[str] = None, digits: int = 5) -> pandas.core.frame.DataFrame +def to_df(self, topic_names: Iterator[str] = None, digits: int = 5) ‑> pandas.core.frame.DataFrame
    @@ -1674,7 +1677,7 @@

    Methods

    -def to_html(self, topic_names: Union[str, List[str]] = None, digits: int = 5, thresh: float = None, horizontally_stack: bool = True) -> str +def to_html(self, topic_names: Union[str, List[str]] = None, digits: int = 5, thresh: float = None, horizontally_stack: bool = True) ‑> str

    Generates html version of dataframes to be displayed by Jupyter notebooks

    @@ -1691,7 +1694,7 @@

    Parameters

    (instead of being a single long multi-line DataFrame)

    Examples

    -
    >>> from IPython.display import HTML, display_html
    +
    >>> from IPython.display import HTML, display_html
     >>>
     >>> # model training here
     >>> # ...
    @@ -1774,7 +1777,7 @@ 

    Examples

    -def view(self, class_ids: List[str] = None, raw_data: List[List[str]] = None, three_levels: bool = True) -> Union[Dict[str, Dict[str, Dict[str, float]]], Dict[str, Dict[Tuple[str, str], float]]] +def view(self, class_ids: List[str] = None, raw_data: List[List[str]] = None, three_levels: bool = True) ‑> Union[Dict[str, Dict[str, Dict[str, float]]], Dict[str, Dict[Tuple[str, str], float]]]

    Returns list of tuples (token, score) for each topic in the model.

    @@ -1915,7 +1918,7 @@

    Returns

     

    Examples

    -
    >>> # model training here
    +
    >>> # model training here
     >>> # ...
     >>> viewer = TopTokensViewer(model)
     >>> information = viewer.view_from_jupyter()
    @@ -2053,9 +2056,7 @@ 

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/viewers/topic_flow_viewer.html b/docs/viewers/topic_flow_viewer.html index 17f491b..d8abb7e 100644 --- a/docs/viewers/topic_flow_viewer.html +++ b/docs/viewers/topic_flow_viewer.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -527,9 +529,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/docs/viewers/topic_mapping.html b/docs/viewers/topic_mapping.html index 4d14735..44e5349 100644 --- a/docs/viewers/topic_mapping.html +++ b/docs/viewers/topic_mapping.html @@ -3,15 +3,17 @@ - + Codestin Search App - - - - + + + + + +
    @@ -825,9 +827,7 @@

    -

    Generated by pdoc 0.8.1.

    +

    Generated by pdoc 0.9.0.

    - - \ No newline at end of file diff --git a/topicnet/__init__.py b/topicnet/__init__.py index a46c06c..948830d 100644 --- a/topicnet/__init__.py +++ b/topicnet/__init__.py @@ -6,4 +6,7 @@ lib = artm.wrapper.LibArtm(logging_config=lc) -__pdoc__ = {"tests": False} +__pdoc__ = { + "embeddings": False, + "tests": False, +} diff --git a/topicnet/cooking_machine/dataset.py b/topicnet/cooking_machine/dataset.py index bc922a4..c9982a1 100644 --- a/topicnet/cooking_machine/dataset.py +++ b/topicnet/cooking_machine/dataset.py @@ -417,7 +417,7 @@ def from_dataframe( Another Parameters ------------------ **kwargs - *kwargs* are optional init `topicnet.Dataset` parameters + *kwargs* are optional init parameters """ data_path = os.path.join(save_dataset_path, dataframe_name + '.csv') dataframe.to_csv(data_path) diff --git a/topicnet/topicnet_doc_generation/README.md b/topicnet/topicnet_doc_generation/README.md index 405378b..7a5a420 100644 --- a/topicnet/topicnet_doc_generation/README.md +++ b/topicnet/topicnet_doc_generation/README.md @@ -29,7 +29,7 @@ pdoc --html -o topicnet_doc_generation/ --force ../topicnet ## Проблемы * `pdoc` не распознает команду. -Проверьте, что установлена версия `pdoc 0.6.3`. +Проверьте, что установлена версия `pdoc3==0.9.0`. * Скрипт по встраиванию Markdown отрабатывает с ошибкой. Скорее всего, это нормально: надо немного доработать скрипт, чтобы он не пытался искать html в тех папках, где его нет.