# Source code for textacy.extract.keyterms.textrank
from __future__ import annotations
import collections
from operator import itemgetter
from typing import Callable, Collection, Dict, List, Optional, Set, Tuple
from spacy.tokens import Doc, Token
from ... import representations, utils
from .. import utils as ext_utils
def textrank(
    doc: Doc,
    *,
    normalize: Optional[str | Callable[[Token], str]] = "lemma",
    include_pos: Optional[str | Collection[str]] = ("NOUN", "PROPN", "ADJ"),
    window_size: int = 2,
    edge_weighting: str = "binary",
    position_bias: bool = False,
    topn: int | float = 10,
) -> List[Tuple[str, float]]:
    """
    Extract key terms from a document using the TextRank algorithm, or
    a variation thereof. For example:

    - TextRank: ``window_size=2, edge_weighting="binary", position_bias=False``
    - SingleRank: ``window_size=10, edge_weighting="count", position_bias=False``
    - PositionRank: ``window_size=10, edge_weighting="count", position_bias=True``

    Args:
        doc: spaCy ``Doc`` from which to extract keyterms.
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms; if None,
            use the form of terms as they appeared in ``doc``; if a callable,
            must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`.
        include_pos: One or more POS tags with which to filter for good candidate keyterms.
            If None, include tokens of all POS tags
            (which also allows keyterm extraction from docs without POS-tagging.)
        window_size: Size of sliding window in which term co-occurrences are determined.
        edge_weighting ({"count", "binary"}): If "count", the nodes for
            all co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if "binary", all such edges have weight = 1.
        position_bias: If True, bias the PageRank algorithm for weighting
            nodes in the word graph, such that words appearing earlier and more
            frequently in ``doc`` tend to get larger weights.
        topn: Number of top-ranked terms to return as key terms.
            If an integer, represents the absolute number; if a float, value
            must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(set(candidates)) * topn))``.

    Returns:
        Sorted list of top ``topn`` key terms and their corresponding TextRank ranking scores.

    Raises:
        ValueError: if ``topn`` is a float but not in (0.0, 1.0].

    References:
        - Mihalcea, R., & Tarau, P. (2004, July). TextRank: Bringing order into texts.
          Association for Computational Linguistics.
        - Wan, Xiaojun and Jianguo Xiao. 2008. Single document keyphrase extraction
          using neighborhood knowledge. In Proceedings of the 23rd AAAI Conference
          on Artificial Intelligence, pages 855–860.
        - Florescu, C. and Cornelia, C. (2017). PositionRank: An Unsupervised Approach
          to Keyphrase Extraction from Scholarly Documents. In proceedings of ACL*,
          pages 1105-1115.
    """
    # validate / transform args
    include_pos = utils.to_collection(include_pos, str, set)
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn)
            )
    # bail out on empty docs
    if not doc:
        return []
    # normalize every token's string form exactly once; this sequence is reused
    # both for the position-bias weights and for the co-occurrence graph below
    norm_words = list(ext_utils.terms_to_strings(doc, normalize))
    word_pos: Optional[Dict[str, float]]
    if position_bias is True:
        # weight each word by the sum of inverse positions at which it occurs,
        # then normalize weights into a probability distribution to use as
        # the pagerank "personalization" vector (PositionRank)
        word_pos = collections.defaultdict(float)
        for word, norm_word in zip(doc, norm_words):
            word_pos[norm_word] += 1 / (word.i + 1)
        sum_word_pos = sum(word_pos.values())
        word_pos = {word: pos / sum_word_pos for word, pos in word_pos.items()}
    else:
        word_pos = None
    # build a graph from all words in doc, then score them
    graph = representations.network.build_cooccurrence_network(
        norm_words,
        window_size=window_size,
        edge_weighting=edge_weighting,
    )
    word_scores = representations.network.rank_nodes_by_pagerank(
        graph, weight="weight", personalization=word_pos
    )
    # generate a list of candidate terms
    candidates = _get_candidates(doc, normalize, include_pos)
    # a float topn is a fraction of the number of distinct candidates
    if isinstance(topn, float):
        topn = int(round(len(set(candidates)) * topn))
    # rank candidates by aggregating constituent word scores;
    # words absent from the graph contribute 0.0
    candidate_scores = {
        " ".join(candidate): sum(word_scores.get(word, 0.0) for word in candidate)
        for candidate in candidates
    }
    # sort by score desc, breaking ties by term string
    sorted_candidate_scores = sorted(
        candidate_scores.items(), key=itemgetter(1, 0), reverse=True
    )
    # drop near-duplicate terms (similarity above threshold) before taking topn
    return ext_utils.get_filtered_topn_terms(
        sorted_candidate_scores, topn, match_threshold=0.8
    )
def _get_candidates(
    doc: Doc, normalize: Optional[str | Callable], include_pos: Optional[Set[str]],
) -> Set[Tuple[str, ...]]:
    """
    Build the set of candidate terms to be scored: take the longest runs of
    "valid" words in ``doc`` -- neither stopword, punctuation, nor whitespace,
    and (when ``include_pos`` is given) restricted to those POS tags --
    and normalize each run into a tuple of strings.
    """
    def _keep(token):
        # guard clause: never keep stopwords, punctuation, or whitespace
        if token.is_stop or token.is_punct or token.is_space:
            return False
        # with no POS filter, everything else is fair game
        return include_pos is None or token.pos_ in include_pos

    return {
        tuple(ext_utils.terms_to_strings(cand, normalize))
        for cand in ext_utils.get_longest_subsequence_candidates(doc, _keep)
    }