from __future__ import annotations
import collections
import functools
import math
import operator
import statistics
from typing import Collection, Dict, Iterable, List, Optional, Set, Tuple
from cytoolz import itertoolz
from spacy.tokens import Doc, Token
from ... import errors, utils
from .. import utils as ext_utils


def yake(
doc: Doc,
*,
normalize: Optional[str] = "lemma",
ngrams: int | Collection[int] = (1, 2, 3),
include_pos: Optional[str | Collection[str]] = ("NOUN", "PROPN", "ADJ"),
window_size: int = 2,
topn: int | float = 10,
) -> List[Tuple[str, float]]:
"""
Extract key terms from a document using the YAKE algorithm.

    Args:
doc: spaCy ``Doc`` from which to extract keyterms.
Must be sentence-segmented; optionally POS-tagged.
normalize: If "lemma", lemmatize terms; if "lower", lowercase terms;
if None, use the form of terms as they appeared in ``doc``.

            .. note:: Unlike the other keyterm extraction functions, this one
               doesn't accept a callable for ``normalize``.
        ngrams: One or more n values of the n-grams to consider as keyterm candidates.
            For example, ``(1, 2, 3)`` includes all unigrams, bigrams, and trigrams,
            while ``2`` includes bigrams only.
include_pos: One or more POS tags with which to filter for good candidate keyterms.
If None, include tokens of all POS tags
            (which also allows keyterm extraction from docs without POS-tagging).
window_size: Number of words to the right and left of a given word
to use as context when computing the "relatedness to context"
component of its score. Note that the resulting sliding window's
full width is ``1 + (2 * window_size)``.
topn: Number of top-ranked terms to return as key terms.
If an integer, represents the absolute number; if a float, value
must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``.

    Returns:
Sorted list of top ``topn`` key terms and their corresponding YAKE scores.

    References:
Campos, Mangaravite, Pasquali, Jorge, Nunes, and Jatowt. (2018).
A Text Feature Based Automatic Keyword Extraction Method for Single Documents.
Advances in Information Retrieval. ECIR 2018.
Lecture Notes in Computer Science, vol 10772, pp. 684-691.
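
    Example:
        Illustrative usage, a minimal sketch assuming a POS-tagging spaCy pipeline
        such as ``en_core_web_sm`` is installed (results will vary by model)::

            >>> import spacy
            >>> nlp = spacy.load("en_core_web_sm")
            >>> doc = nlp("Automatic keyword extraction identifies the most salient terms in a document.")
            >>> yake(doc, ngrams=(1, 2), topn=5)  # doctest: +SKIP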
"""
# validate / transform args
ngrams = utils.to_collection(ngrams, int, tuple)
include_pos = utils.to_collection(include_pos, str, set)
if isinstance(topn, float):
if not 0.0 < topn <= 1.0:
raise ValueError(
f"topn = {topn} is invalid; "
"must be an int, or a float between 0.0 and 1.0"
)
# bail out on empty docs
if not doc:
return []
stop_words: Set[str] = set()
seen_candidates: Set[str] = set()
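    # NOTE: both sets above are populated as side effects by the helper functions below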
# compute key values on a per-word basis
word_occ_vals = _get_per_word_occurrence_values(
doc, normalize, stop_words, window_size
)
# doc doesn't have any words...
if not word_occ_vals:
return []
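    # a word's frequency is its total number of occurrences in the doc;
    # the per-occurrence "is_uc" list doubles as an occurrence counter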
word_freqs = {w_id: len(vals["is_uc"]) for w_id, vals in word_occ_vals.items()}
word_scores = _compute_word_scores(doc, word_occ_vals, word_freqs, stop_words)
# compute scores for candidate terms based on scores of constituent words
term_scores: Dict[str, float] = {}
# do single-word candidates separately; it's faster and simpler
if 1 in ngrams:
candidates = _get_unigram_candidates(doc, include_pos)
_score_unigram_candidates(
candidates,
word_freqs,
word_scores,
term_scores,
stop_words,
seen_candidates,
normalize,
)
    # now compute combined scores for higher-n ngram candidates
candidates = list(
ext_utils.get_ngram_candidates(
doc, [n for n in ngrams if n > 1], include_pos=include_pos,
)
)
attr_name = _get_attr_name(normalize, True)
ngram_freqs = itertoolz.frequencies(
" ".join(getattr(word, attr_name) for word in ngram) for ngram in candidates
)
_score_ngram_candidates(
candidates, ngram_freqs, word_scores, term_scores, seen_candidates, normalize,
)
# build up a list of key terms in order of increasing score
if isinstance(topn, float):
topn = int(round(len(seen_candidates) * topn))
sorted_term_scores = sorted(
term_scores.items(), key=operator.itemgetter(1), reverse=False,
)
return ext_utils.get_filtered_topn_terms(
sorted_term_scores, topn, match_threshold=0.8
)


def _get_attr_name(normalize: Optional[str], as_strings: bool) -> str:
if normalize is None:
attr_name = "norm"
elif normalize in ("lemma", "lower"):
attr_name = normalize
else:
raise ValueError(
errors.value_invalid_msg("normalize", normalize, {"lemma", "lower", None})
)
if as_strings is True:
attr_name = attr_name + "_"
return attr_name


def _get_per_word_occurrence_values(
doc: Doc, normalize: Optional[str], stop_words: Set[str], window_size: int,
) -> Dict[int, Dict[str, list]]:
"""
Get base values for each individual occurrence of a word, to be aggregated
and combined into a per-word score.
"""
word_occ_vals = collections.defaultdict(lambda: collections.defaultdict(list))
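    # an occurrence counts as "upper-cased" if the token is all-caps (e.g. an acronym)
    # or title-cased anywhere other than the start of a sentence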
def _is_upper_cased(tok):
return tok.is_upper or (tok.is_title and not tok.is_sent_start)
attr_name = _get_attr_name(normalize, False)
padding = [None] * window_size
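    # pad each sentence with ``None`` on both sides so that every token gets a full
    # sliding window; the ``None`` placeholders are filtered out of the context lists below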
for sent_idx, sent in enumerate(doc.sents):
sent_padded = itertoolz.concatv(padding, sent, padding)
for window in itertoolz.sliding_window(1 + (2 * window_size), sent_padded):
lwords, word, rwords = (
window[:window_size],
window[window_size],
window[window_size + 1 :],
)
w_id = getattr(word, attr_name)
if word.is_stop:
stop_words.add(w_id)
word_occ_vals[w_id]["is_uc"].append(_is_upper_cased(word))
word_occ_vals[w_id]["sent_idx"].append(sent_idx)
word_occ_vals[w_id]["l_context"].extend(
getattr(w, attr_name)
for w in lwords
if not (w is None or w.is_punct or w.is_space)
)
word_occ_vals[w_id]["r_context"].extend(
getattr(w, attr_name)
for w in rwords
if not (w is None or w.is_punct or w.is_space)
)
return word_occ_vals


def _compute_word_scores(
doc: Doc,
word_occ_vals: Dict[int, Dict[str, list]],
word_freqs: Dict[int, int],
stop_words: Set[str],
) -> Dict[int, float]:
"""
Aggregate values from per-word occurrence values, compute per-word weights
of several components, then combine components into per-word scores.
"""
word_weights = collections.defaultdict(dict)
# compute summary stats for word frequencies
freqs_nsw = [freq for w_id, freq in word_freqs.items() if w_id not in stop_words]
freq_max = max(word_freqs.values())
freq_baseline = statistics.mean(freqs_nsw) + statistics.stdev(freqs_nsw)
n_sents = itertoolz.count(doc.sents)
for w_id, vals in word_occ_vals.items():
freq = word_freqs[w_id]
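        # "case": emphasis from all-caps / title-case occurrences, damped by log frequency
        # "pos": positional weight; words whose occurrences cluster in early sentences weigh less
        # "freq": frequency normalized against the mean + stdev of non-stopword frequencies
        # "disp": sentence dispersion, i.e. fraction of the doc's sentences containing the word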
word_weights[w_id]["case"] = sum(vals["is_uc"]) / math.log2(1 + freq)
word_weights[w_id]["pos"] = math.log2(
math.log2(3 + statistics.mean(vals["sent_idx"]))
)
word_weights[w_id]["freq"] = freq / freq_baseline
word_weights[w_id]["disp"] = len(set(vals["sent_idx"])) / n_sents
n_unique_lc = len(set(vals["l_context"]))
n_unique_rc = len(set(vals["r_context"]))
try:
wl = n_unique_lc / len(vals["l_context"])
except ZeroDivisionError:
wl = 0.0
try:
wr = n_unique_rc / len(vals["r_context"])
except ZeroDivisionError:
wr = 0.0
pl = n_unique_lc / freq_max
pr = n_unique_rc / freq_max
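        # "rel": relatedness to context; the more varied a word's left/right contexts are
        # relative to its frequency, the more stopword-like it behaves and the higher
        # (i.e. worse) its weight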
word_weights[w_id]["rel"] = 1.0 + (wl + wr) * (freq / freq_max) + pl + pr
# combine individual weights into per-word scores
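    # score = (rel * pos) / (case + freq / rel + disp / rel); lower scores mark more important words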
word_scores = {
w_id: (wts["rel"] * wts["pos"])
/ (wts["case"] + (wts["freq"] / wts["rel"]) + (wts["disp"] / wts["rel"]))
for w_id, wts in word_weights.items()
}
return word_scores


def _get_unigram_candidates(doc: Doc, include_pos: Optional[Set[str]]) -> Iterable[Token]:
candidates = (
word for word in doc if not (word.is_stop or word.is_punct or word.is_space)
)
if include_pos:
candidates = (word for word in candidates if word.pos_ in include_pos)
return candidates


def _score_unigram_candidates(
candidates: Iterable[Token],
word_freqs: Dict[int, int],
word_scores: Dict[int, float],
term_scores: Dict[str, float],
stop_words: Set[str],
seen_candidates: Set[str],
normalize: Optional[str],
) -> None:
attr_name = _get_attr_name(normalize, False)
attr_name_str = _get_attr_name(normalize, True)
for word in candidates:
w_id = getattr(word, attr_name)
if w_id in stop_words or w_id in seen_candidates:
continue
else:
seen_candidates.add(w_id)
        # NOTE: here I've modified the YAKE algorithm to put less emphasis on term freq
# term_scores[word.lower_] = word_scores[w_id] / (word_freqs[w_id] * (1 + word_scores[w_id]))
term_scores[getattr(word, attr_name_str)] = word_scores[w_id] / (
math.log2(1 + word_freqs[w_id]) * (1 + word_scores[w_id])
)


def _score_ngram_candidates(
candidates: List[Tuple[Token, ...]],
ngram_freqs: Dict[str, int],
word_scores: Dict[int, float],
term_scores: Dict[str, float],
seen_candidates: Set[str],
normalize: Optional[str],
) -> None:
attr_name = _get_attr_name(normalize, False)
attr_name_str = _get_attr_name(normalize, True)
for ngram in candidates:
ngtxt = " ".join(getattr(word, attr_name_str) for word in ngram)
if ngtxt in seen_candidates:
continue
else:
seen_candidates.add(ngtxt)
ngram_word_scores = [word_scores[getattr(word, attr_name)] for word in ngram]
# multiply individual word scores together in the numerator
numerator = functools.reduce(operator.mul, ngram_word_scores, 1.0)
        # NOTE: here I've modified the YAKE algorithm to put less emphasis on term freq
# denominator = ngram_freqs[ngtxt] * (1.0 + sum(ngram_word_scores))
denominator = math.log2(1 + ngram_freqs[ngtxt]) * (1.0 + sum(ngram_word_scores))
term_scores[ngtxt] = numerator / denominator