Functions for performing topic modelling to get Aurum results.

Source code
"""
Functions for performing topic modelling to get `ignis.aurum.Aurum` results.
"""
import time

from tqdm.auto import tqdm

import ignis
import ignis.aurum
import ignis.corpus
import ignis.models


def init_lda_model_wp(corpus_slice, model_options=None):
    """
    Prepare an `ignis.models.lda.LDAModel` for use with word priors.
    (contrib.: C. Ow)

    Parameters
    ----------
    corpus_slice: ignis.corpus.Corpus or ignis.corpus.CorpusSlice
        The `ignis.corpus.CorpusSlice` to train the model on.
    model_options: dict, optional
        Model-specific options.  See `ignis.models.lda.LDAModel` for details.

    Returns
    -------
    ignis.models.lda.LDAModel
    """
    if type(corpus_slice) is ignis.corpus.Corpus:
        corpus_slice = corpus_slice.slice_full()
    if type(corpus_slice) is not ignis.corpus.CorpusSlice:
        raise ValueError(
            "Ignis models must be instantiated with Corpus or CorpusSlice instances."
        )

    return ignis.models.LDAModel(corpus_slice, model_options)


def train_model(
    corpus_slice,
    pre_model=None,
    model_type="tp_lda",
    model_options=None,
    labeller_type=None,
    labeller_options=None,
    vis_type="pyldavis",
    vis_options=None,
):
    """
    Top-level helper for training topic models using the various algorithms available.

    Parameters
    ----------
    corpus_slice: ignis.corpus.Corpus or ignis.corpus.CorpusSlice
        The `ignis.corpus.CorpusSlice` to perform the topic modelling over.  If a
        `ignis.corpus.Corpus` is passed instead, a `ignis.corpus.CorpusSlice`
        containing all of its `ignis.corpus.Document` objects will be created.
    pre_model: ignis.models.lda.LDAModel, optional
        Required when training a `tomotopy` LDA model with word priors
        (`model_type="tp_lda_wp"`); ignored for other model types. Default is `None`.
    model_type: {"tp_lda", "tp_hdp", "tp_lda_wp"}
        Type of model to train; corresponds to the model type listed in the relevant
        `ignis.models` class.
    model_options: dict, optional
        Dictionary of options that will be passed to the relevant `ignis.models`
        model constructor.
    labeller_type: {"tomotopy"}, optional
        The type of automated labeller to use, if available.
    labeller_options: dict, optional
        Dictionary of options that will be passed to the relevant `ignis.labeller`
        object constructor.
    vis_type: {"pyldavis"}, optional
        The type of visualisation data to extract, if available.
    vis_options: dict, optional
        Dictionary of options that will be passed to the relevant `ignis.vis`
        object constructor.

    Returns
    -------
    ignis.aurum.Aurum
        The `ignis.aurum.Aurum` results object for the trained model, which can be used
        for further exploration and iteration.
    """
    if type(corpus_slice) is ignis.corpus.Corpus:
        corpus_slice = corpus_slice.slice_full()
    if type(corpus_slice) is not ignis.corpus.CorpusSlice:
        raise ValueError(
            "Ignis models must be instantiated with Corpus or CorpusSlice instances."
        )

    if model_type == "tp_lda":
        model = ignis.models.LDAModel(corpus_slice, model_options)
        model.train()
        aurum = ignis.aurum.Aurum(model)
    elif model_type == "tp_lda_wp":
        # Tomotopy LDA model with word priors
        # (contrib.: C. Ow)
        if isinstance(pre_model, ignis.models.lda.LDAModel):
            model = pre_model
        else:
            raise ValueError(
                "Ignis models with word priors must be pre-instantiated "
                "`ignis.models.lda.LDAModel` instances."
            )
        model.train()
        aurum = ignis.aurum.Aurum(model)
    elif model_type == "tp_hdp":
        model = ignis.models.HDPModel(corpus_slice, model_options)
        model.train()
        aurum = ignis.aurum.Aurum(model)
    else:
        raise ValueError(f"Unknown model type: '{model_type}'")

    if labeller_type is not None:
        if labeller_options is None:
            labeller_options = {}

        aurum.init_labeller(labeller_type, **labeller_options)

    if vis_type is not None:
        if vis_options is None:
            vis_options = {}

        aurum.init_vis(vis_type, **vis_options)

    return aurum


def compare_topic_count_coherence(
    corpus_slice,
    model_type="tp_lda",
    model_options=None,
    coherence="c_npmi",
    top_n=30,
    start_k=3,
    end_k=10,
    iterations=150,
    verbose=True,
):
    """
    Lightly trains models with various topic counts and reports the resultant coherence
    scores.

    These scores can be used as a heuristic for choosing the number of topics to use
    for full training (e.g., via `suggest_num_topics()`).

    Parameters
    ----------
    corpus_slice: ignis.corpus.Corpus or ignis.corpus.CorpusSlice
        The `ignis.corpus.CorpusSlice` to perform the topic modelling over.  If a
        `ignis.corpus.Corpus` is passed instead, a `ignis.corpus.CorpusSlice`
        containing all of its `ignis.corpus.Document` objects will be created.
    model_type: {"tp_lda", "tp_hdp"}
        Type of model to train; corresponds to the model type listed in the relevant
        `ignis.models` class.
    model_options: dict, optional
        Dictionary of options that will be passed to the relevant `ignis.models`
        model constructor.
    coherence: {"c_npmi", "c_v", "u_mass", "c_uci"}, optional
        Coherence measure to calculate. `"c_npmi"` by default.
    top_n: int, optional
        Number of top tokens to extract from each topic when measuring coherence.
        The default of 30 matches the number of tokens shown per topic by pyLDAvis.
    start_k: int, optional
        Minimum topic count to consider.
    end_k: int, optional
        Maximum topic count to consider.
    iterations: int, optional
        Number of iterations to train each candidate model for.
    verbose: bool, optional
        Whether or not to show interim training progress.

    Returns
    -------
    list of tuple
        A list of tuples (`topic count`, `coherence score`) for all the topic counts
        in the test range.
    """
    if type(corpus_slice) is ignis.corpus.Corpus:
        corpus_slice = corpus_slice.slice_full()
    if type(corpus_slice) is not ignis.corpus.CorpusSlice:
        raise ValueError(
            "Ignis models must be instantiated with Corpus or CorpusSlice instances."
        )

    if model_options is None:
        model_options = {}

    progress_bar = None
    if verbose:
        total_models = end_k - start_k + 1
        print(
            f"Training {total_models} mini-models to suggest a suitable number of "
            f"topics between {start_k} and {end_k}...\n"
            f"({len(corpus_slice)} documents, {iterations} iterations each, "
            f"coherence metric: '{coherence}')"
        )
        progress_bar = tqdm(total=total_models * iterations, miniters=1)

    results = []
    candidate_counts = range(start_k, end_k + 1)
    for k in candidate_counts:
        this_options = dict(
            model_options,
            k=k,
            iterations=iterations,
            update_every=iterations,
            until_max_ll=False,
            until_max_coherence=False,
            verbose=False,
        )

        if model_type == "tp_lda":
            model = ignis.models.LDAModel(corpus_slice, this_options)
            model.train()
            results.append(
                (
                    k,
                    model.get_coherence(
                        coherence=coherence,
                        top_n=top_n,
                    ),
                )
            )

        if verbose:
            progress_bar.update(iterations)
            # To allow the tqdm bar to update, if in a Jupyter notebook
            time.sleep(0.01)

    if verbose:
        progress_bar.close()
    return results


def suggest_num_topics(*args, verbose=True, **kwargs):
    """
    Convenience function for running `compare_topic_count_coherence()` and directly
    reporting the topic count with the highest coherence found.

    Parameters
    ----------
    verbose: bool, optional
        Whether or not to print the details of the best topic count.
    *args, **kwargs
        Passed on to `compare_topic_count_coherence()`.

    Returns
    -------
    int
        The suggested topic count.
    """
    results = compare_topic_count_coherence(*args, verbose=verbose, **kwargs)
    results = sorted(results, key=lambda x: x[1], reverse=True)
    best = results[0]

    if verbose:
        print(f"Suggested topic count: {best[0]}\t" f"Coherence: {best[1]}")

    return best[0]

Functions

def compare_topic_count_coherence(corpus_slice, model_type='tp_lda', model_options=None, coherence='c_npmi', top_n=30, start_k=3, end_k=10, iterations=150, verbose=True)

Lightly trains models with various topic counts and reports the resultant coherence scores.

These scores can be used as a heuristic for choosing the number of topics to use for full training (e.g., via suggest_num_topics()).

Parameters

corpus_slice : Corpus or CorpusSlice
    The CorpusSlice to perform the topic modelling over. If a Corpus is passed instead, a CorpusSlice containing all of its Document objects will be created.
model_type : {"tp_lda", "tp_hdp"}
    Type of model to train; corresponds to the model type listed in the relevant ignis.models class.
model_options : dict, optional
    Dictionary of options that will be passed to the relevant ignis.models model constructor.
coherence : {"c_npmi", "c_v", "u_mass", "c_uci"}, optional
    Coherence measure to calculate. "c_npmi" by default.
top_n : int, optional
    Number of top tokens to extract from each topic when measuring coherence. The default of 30 matches the number of tokens shown per topic by pyLDAvis.
start_k : int, optional
    Minimum topic count to consider.
end_k : int, optional
    Maximum topic count to consider.
iterations : int, optional
    Number of iterations to train each candidate model for.
verbose : bool, optional
    Whether or not to show interim training progress.

Returns

list of tuple
    A list of (topic count, coherence score) tuples for all the topic counts in the test range.
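
A minimal usage sketch, not part of the library source: it assumes this module is importable as ignis.probat (adjust the import to match your install) and that corpus is an ignis.corpus.Corpus or CorpusSlice that has already been built elsewhere.

# Sketch: scan a range of topic counts and print the coherence score for each.
# Assumptions: ignis.probat is this module's import path; `corpus` is an
# already-built ignis.corpus.Corpus or CorpusSlice.
from ignis.probat import compare_topic_count_coherence

def scan_topic_counts(corpus):
    results = compare_topic_count_coherence(
        corpus,
        model_type="tp_lda",
        coherence="c_npmi",
        start_k=3,
        end_k=12,
        iterations=100,
    )
    # `results` is a list of (topic count, coherence score) tuples
    for k, score in results:
        print(f"k={k}: coherence={score:.4f}")
    return results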
def init_lda_model_wp(corpus_slice, model_options=None)

Prepare an LDAModel for use with word priors. (contrib.: C. Ow)

Parameters

corpus_slice : Corpus or CorpusSlice
    The CorpusSlice to train the model on.
model_options : dict, optional
    Model-specific options. See LDAModel for details.

Returns

LDAModel
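
A hedged sketch of how this fits into the word-prior workflow: the pre-model is created here, the word priors themselves are set on it through whatever interface ignis.models.lda.LDAModel provides (that step is outside this module), and the pre-model is then handed to train_model() with model_type="tp_lda_wp".

# Sketch of the word-prior workflow. Assumptions: ignis.probat is this module's
# import path; `corpus` is an already-built Corpus or CorpusSlice.
from ignis.probat import init_lda_model_wp, train_model

def train_with_word_priors(corpus, model_options=None):
    # Prepare the (untrained) LDA model that the word priors will be attached to
    pre_model = init_lda_model_wp(corpus, model_options)

    # ... set word priors on `pre_model` here, using the interface provided by
    # ignis.models.lda.LDAModel (not covered by this module) ...

    # Train the pre-instantiated model and get the Aurum results object
    return train_model(corpus, pre_model=pre_model, model_type="tp_lda_wp")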
 
def suggest_num_topics(*args, verbose=True, **kwargs)

Convenience function for running compare_topic_count_coherence() and directly reporting the topic count with the highest coherence found.

Parameters

verbose : bool, optional
    Whether or not to print the details of the best topic count.
*args, **kwargs
    Passed on to compare_topic_count_coherence().

Returns

int
    The suggested topic count.
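
For example, under the same assumptions as the sketches above (ignis.probat as the import path, corpus built elsewhere); the "k" key used below is the same topic-count option that compare_topic_count_coherence() sets on each candidate model internally.

# Sketch: pick a topic count by coherence, then use it for a full training run.
from ignis.probat import suggest_num_topics, train_model

def train_with_suggested_k(corpus):
    k = suggest_num_topics(corpus, start_k=3, end_k=15, iterations=100)
    # "k" is the model option that compare_topic_count_coherence() passes
    # through to each candidate LDAModel
    return train_model(corpus, model_type="tp_lda", model_options={"k": k})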
def train_model(corpus_slice, pre_model=None, model_type='tp_lda', model_options=None, labeller_type=None, labeller_options=None, vis_type='pyldavis', vis_options=None)

Top-level helper for training topic models using the various algorithms available.

Parameters

corpus_slice : Corpus or CorpusSlice
    The CorpusSlice to perform the topic modelling over. If a Corpus is passed instead, a CorpusSlice containing all of its Document objects will be created.
pre_model : LDAModel, optional
    Required when training a tomotopy LDA model with word priors (model_type="tp_lda_wp"); ignored for other model types. Default is None.
model_type : {"tp_lda", "tp_hdp", "tp_lda_wp"}
    Type of model to train; corresponds to the model type listed in the relevant ignis.models class.
model_options : dict, optional
    Dictionary of options that will be passed to the relevant ignis.models model constructor.
labeller_type : {"tomotopy"}, optional
    The type of automated labeller to use, if available.
labeller_options : dict, optional
    Dictionary of options that will be passed to the relevant ignis.labeller object constructor.
vis_type : {"pyldavis"}, optional
    The type of visualisation data to extract, if available.
vis_options : dict, optional
    Dictionary of options that will be passed to the relevant ignis.vis object constructor.

Returns

Aurum
    The Aurum results object for the trained model, which can be used for further exploration and iteration.
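
Putting it together, a minimal end-to-end sketch (again assuming ignis.probat as the import path and an already-built corpus; the labeller and visualisation choices below are the documented options, not requirements).

# Sketch: train an LDA model, with automated topic labelling and pyLDAvis data.
from ignis.probat import train_model

def run_lda(corpus, num_topics=10):
    aurum = train_model(
        corpus,
        model_type="tp_lda",
        model_options={"k": num_topics},  # "k" = number of topics (see above)
        labeller_type="tomotopy",         # optional automated labeller
        vis_type="pyldavis",              # default visualisation data
    )
    # The returned ignis.aurum.Aurum object is the entry point for exploring
    # topics, labels and visualisations, and for further iteration.
    return aurum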