Source code for textacy.extract.keyterms.textrank

from __future__ import annotations

import collections
from operator import itemgetter
from typing import Callable, Collection, Dict, List, Optional, Set, Tuple

from spacy.tokens import Doc, Token

from ... import representations, utils
from .. import utils as ext_utils


def textrank(
    doc: Doc,
    *,
    normalize: Optional[str | Callable[[Token], str]] = "lemma",
    include_pos: Optional[str | Collection[str]] = ("NOUN", "PROPN", "ADJ"),
    window_size: int = 2,
    edge_weighting: str = "binary",
    position_bias: bool = False,
    topn: int | float = 10,
) -> List[Tuple[str, float]]:
    """
    Extract key terms from a document using the TextRank algorithm, or
    a variation thereof. For example:

    - TextRank: ``window_size=2, edge_weighting="binary", position_bias=False``
    - SingleRank: ``window_size=10, edge_weighting="count", position_bias=False``
    - PositionRank: ``window_size=10, edge_weighting="count", position_bias=True``

    Args:
        doc: spaCy ``Doc`` from which to extract keyterms.
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms;
            if None, use the form of terms as they appeared in ``doc``;
            if a callable, must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text`.
        include_pos: One or more POS tags with which to filter for good candidate
            keyterms. If None, include tokens of all POS tags (which also allows
            keyterm extraction from docs without POS-tagging).
        window_size: Size of sliding window in which term co-occurrences
            are determined.
        edge_weighting ({"count", "binary"}): If "count", the nodes for all
            co-occurring terms are connected by edges with weight equal to the
            number of times they co-occurred within a sliding window;
            if "binary", all such edges have weight = 1.
        position_bias: If True, bias the PageRank algorithm for weighting nodes
            in the word graph, such that words appearing earlier and more
            frequently in ``doc`` tend to get larger weights.
        topn: Number of top-ranked terms to return as key terms. If an integer,
            represents the absolute number; if a float, value must be in the
            interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(set(candidates)) * topn))``.

    Returns:
        Sorted list of top ``topn`` key terms and their corresponding
        TextRank ranking scores.

    References:
        - Mihalcea, R., & Tarau, P. (2004). TextRank: Bringing order into texts.
          In Proceedings of EMNLP 2004, pages 404-411. Association for
          Computational Linguistics.
        - Wan, X., & Xiao, J. (2008). Single document keyphrase extraction using
          neighborhood knowledge. In Proceedings of the 23rd AAAI Conference on
          Artificial Intelligence, pages 855-860.
        - Florescu, C., & Caragea, C. (2017). PositionRank: An unsupervised
          approach to keyphrase extraction from scholarly documents.
          In Proceedings of ACL 2017, pages 1105-1115.
    """
    # validate / transform args
    include_pos = utils.to_collection(include_pos, str, set)
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                f"topn={topn} is invalid; "
                "must be an int, or a float between 0.0 and 1.0"
            )
    # bail out on empty docs
    if not doc:
        return []

    # if position-biased, build a personalization vector for PageRank:
    # each word's weight is the sum of reciprocal (1-indexed) positions of its
    # occurrences in ``doc``, normalized so that all weights sum to 1.0
    word_pos: Optional[Dict[str, float]]
    if position_bias:
        word_pos = collections.defaultdict(float)
        for word, norm_word in zip(doc, ext_utils.terms_to_strings(doc, normalize)):
            word_pos[norm_word] += 1 / (word.i + 1)
        sum_word_pos = sum(word_pos.values())
        word_pos = {word: pos / sum_word_pos for word, pos in word_pos.items()}
    else:
        word_pos = None
    # build a co-occurrence graph from all words in doc, then score nodes by PageRank
    graph = representations.network.build_cooccurrence_network(
        list(ext_utils.terms_to_strings(doc, normalize)),
        window_size=window_size,
        edge_weighting=edge_weighting,
    )
    word_scores = representations.network.rank_nodes_by_pagerank(
        graph, weight="weight", personalization=word_pos
    )
    # generate a set of candidate terms
    candidates = _get_candidates(doc, normalize, include_pos)
    if isinstance(topn, float):
        topn = int(round(len(set(candidates)) * topn))
    # rank candidates by summing the scores of their constituent words
    candidate_scores = {
        " ".join(candidate): sum(word_scores.get(word, 0.0) for word in candidate)
        for candidate in candidates
    }
    sorted_candidate_scores = sorted(
        candidate_scores.items(), key=itemgetter(1, 0), reverse=True
    )
    return ext_utils.get_filtered_topn_terms(
        sorted_candidate_scores, topn, match_threshold=0.8
    )
def _get_candidates(
    doc: Doc,
    normalize: Optional[str | Callable],
    include_pos: Optional[Set[str]],
) -> Set[Tuple[str, ...]]:
    """
    Get a set of candidate terms to be scored by joining the longest
    subsequences of valid words -- non-stopword and non-punct, filtered to
    nouns, proper nouns, and adjectives if ``doc`` is POS-tagged -- then
    normalized into strings.
    """

    def _is_valid_tok(tok):
        return not (tok.is_stop or tok.is_punct or tok.is_space) and (
            include_pos is None or tok.pos_ in include_pos
        )

    candidates = ext_utils.get_longest_subsequence_candidates(doc, _is_valid_tok)
    return {
        tuple(ext_utils.terms_to_strings(candidate, normalize))
        for candidate in candidates
    }
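
# For intuition: ``get_longest_subsequence_candidates`` yields maximal runs of
# consecutive tokens that all satisfy the match function. A minimal sketch of
# the idea (hypothetical standalone code, not the actual textacy helper):
#
#     import itertools
#
#     def longest_subsequence_candidates(tokens, match_func):
#         # group consecutive tokens by match/no-match; keep the matching runs
#         for is_match, group in itertools.groupby(tokens, key=match_func):
#             if is_match:
#                 yield tuple(group)
#
# For example, in "Burton studies natural language processing", with stopwords
# and non-NOUN/PROPN/ADJ tokens filtered out by ``_is_valid_tok``, the maximal
# valid runs -- and hence the candidates -- would be ("Burton",) and
# ("natural", "language", "processing").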