# Source code for textacy.extract.acros

"""
Acronyms
--------

:mod:`textacy.extract.acros`: Extract acronyms and their definitions from a document
or sentence through rule-based pattern-matching of the annotated tokens.
"""
from __future__ import annotations

import collections
from operator import itemgetter
from typing import Dict, Iterable, List, Optional, Set, Tuple

import numpy as np
from spacy.tokens import Span, Token

from .. import constants, types


def acronyms(doclike: types.DocLike) -> Iterable[Token]:
    """
    Lazily scan a document or sentence and emit every token whose text is
    "acronym-like", preserving the order in which the tokens appear.

    Args:
        doclike

    Yields:
        Next acronym-like ``Token``.
    """
    yield from (candidate for candidate in doclike if is_acronym(candidate.text))


def _is_candidate_window(acronym: str, window_text: str) -> bool:
    """Return True if ``window_text`` could plausibly hold a definition of ``acronym``."""
    # window text can't be all uppercase
    if window_text.isupper():
        return False
    # window can't contain separating punctuation
    if any(punct in window_text for punct in "!?:;"):
        return False
    # acronym definition can't contain itself: no ouroboros!
    if acronym in window_text:
        return False
    # window must contain at least one character used in acronym
    return any(char in window_text for char in acronym)


def acronyms_and_definitions(
    doclike: types.DocLike,
    known_acro_defs: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """
    Extract a collection of acronyms and their most likely definitions, if available,
    from a spacy-parsed doc. If multiple definitions are found for a given acronym,
    only the definition with the highest confidence score is returned (the earliest
    one found wins ties).

    Args:
        doclike
        known_acro_defs: If certain acronym/definition pairs are known, pass them in
            as {acronym (str): definition (str)}; algorithm will not attempt to
            find new definitions

    Returns:
        Unique acronyms (keys) with matched definitions (values); an acronym for
        which no definition was found maps to the empty string.

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their definitions."
        International Journal on Document Analysis and Recognition 1.4 (1999): 191-198.
    """
    # candidate (definition, confidence) pairs collected per acronym
    candidates: Dict[str, List[Tuple[str, float]]] = collections.defaultdict(list)
    if known_acro_defs:
        for acro, def_ in known_acro_defs.items():
            candidates[acro] = [(def_, 1.0)]
    known_acronyms = set(candidates)

    # a Span is treated as a single sentence; anything else is assumed to be
    # a spacy.Doc exposing ``.sents``
    sents = [doclike] if isinstance(doclike, Span) else doclike.sents

    # iterate over sentences and their tokens
    for sent in sents:
        max_ind = len(sent) - 1
        for i, token in enumerate(sent):
            token_ = token.text
            if token_ in known_acronyms or not is_acronym(token_):
                continue

            # define definition search window(s)
            window_size = min(2 * len(token_), len(token_) + 5)
            # NOTE(review): the post-window's end is clamped to ``max_ind``, so the
            # sentence's final token is never searched -- presumably to skip the
            # closing punctuation; confirm this is intended for arbitrary Spans.
            windows = [
                sent[max(i - window_size, 0) : i],
                sent[min(i + 1, max_ind) : min(i + window_size + 1, max_ind)],
            ]
            # if candidate inside (X) or -X-, only look in pre-window
            if 0 < i < max_ind:
                adjacent_tokens = sent[i - 1].text + sent[i + 1].text
                if adjacent_tokens in {"()", "--", "––"}:
                    windows.pop()

            # iterate over possible windows, filtering for valid definition strings
            for window in windows:
                if not _is_candidate_window(token_, window.text):
                    continue
                definition, confidence = _get_acronym_definition(
                    token_, window, threshold=0.8
                )
                if definition:
                    candidates[token_].append((definition, confidence))

            # make sure every acronym seen has at least a placeholder entry
            if not candidates.get(token_):
                candidates[token_].append(("", 0.0))

    # vote by confidence score in the case of multiple definitions; build a new
    # dict rather than overwriting list values with strings mid-iteration
    return {
        acro: max(defs, key=itemgetter(1))[0] for acro, defs in candidates.items()
    }


def _get_acronym_definition(
    acronym: str,
    window: Span,
    threshold: float = 0.8,
) -> Tuple[str, float]:
    """
    Identify most likely definition for an acronym given a list of tokens.

    The acronym's letters are aligned against the leading letters of the window's
    tokens via a longest-common-subsequence (LCS) search; every maximal alignment
    is enumerated and the best one is chosen by a fixed set of tie-break rules.

    Args:
        acronym: acronym for which definition is sought
        window: a span of tokens from which definition extraction will be attempted
        threshold: minimum "confidence" in definition required for acceptance;
            valid values in [0.0, 1.0]; higher value => stricter threshold

    Returns:
        Most likely definition for given acronym ('' if none found),
        along with the confidence assigned to it

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their definitions."
        International Journal on Document Analysis and Recognition 1.4 (1999): 191-198.
    """

    def build_lcs_matrix(X, Y):
        """Return (c, b): the padded LCS-length table and the 0/1 match matrix."""
        m = len(X)
        n = len(Y)
        b = np.zeros((m, n), dtype=int)
        # one extra leading row/column of zeros: without the padding, ``i - 1`` and
        # ``j - 1`` at the borders wrap around to the *last* row/column in numpy
        # and silently inflate the LCS length
        c = np.zeros((m + 1, n + 1), dtype=int)
        for i in range(m):
            for j in range(n):
                if X[i] == Y[j]:
                    c[i + 1, j + 1] = c[i, j] + 1
                    b[i, j] = 1
                else:
                    c[i + 1, j + 1] = max(c[i, j + 1], c[i + 1, j])
        return c, b

    def parse_lcs_matrix(b, start_i, start_j, lcs_length, stack, vectors):
        """
        Collect every chain of ``lcs_length`` matches with strictly increasing
        acronym and window indices. Each chain becomes a vector with one slot per
        window lead: the matched acronym index, or NaN if unmatched.
        """
        m = b.shape[0]
        n = b.shape[1]
        for i in range(start_i, m):
            for j in range(start_j, n):
                if b[i, j] != 1:
                    continue
                stack.append((i, j))
                if lcs_length == 1:
                    vec = [np.nan] * n
                    for k, l in stack:
                        vec[l] = k
                    vectors.append(vec)
                else:
                    parse_lcs_matrix(b, i + 1, j + 1, lcs_length - 1, stack, vectors)
                # backtrack: drop only this match so the shared prefix built by
                # the callers stays intact for the next candidate
                stack.pop()
        return vectors

    def vector_values(v, lead_types):
        """Compute the size / distance / stop_count / misses features of a vector."""
        vv = {}
        first = v.index(int(np.nanmin(v)))
        last = v.index(int(np.nanmax(v)))
        vv["size"] = (last - first) + 1
        vv["distance"] = len(v) - last
        vv["stop_count"] = 0
        vv["misses"] = 0
        for i in range(first, last + 1):
            if v[i] >= 0 and lead_types[i] == "s":
                vv["stop_count"] += 1
            # unmatched slots hold NaN (never None), so test for NaN explicitly
            elif np.isnan(v[i]) and lead_types[i] not in ("s", "h"):
                vv["misses"] += 1
        return vv

    def compare_vectors(A, B, lead_types):
        """Return whichever of A or B is the better alignment; ties go to A."""
        vv_A = vector_values(A, lead_types)
        vv_B = vector_values(B, lead_types)
        # no one-letter matches, sorryboutit
        if vv_A["size"] == 1:
            return B
        elif vv_B["size"] == 1:
            return A
        # fewer misses, then fewer stop words, then smaller distance, then
        # smaller size wins; lexicographic tuple comparison encodes that order
        key_A = (vv_A["misses"], vv_A["stop_count"], vv_A["distance"], vv_A["size"])
        key_B = (vv_B["misses"], vv_B["stop_count"], vv_B["distance"], vv_B["size"])
        return B if key_A > key_B else A

    # get definition window's leading characters and word types, remembering which
    # token each lead came from (a hyphenated token contributes several leads)
    def_leads = []
    def_types = []
    lead_tok_idxs = []
    for tok_idx, tok in enumerate(window):
        tok_text = tok.text
        if tok.is_stop:
            leads, kinds = [tok_text[0]], ["s"]
        elif is_acronym(tok_text):
            leads, kinds = [tok_text[0]], ["a"]
        elif "-" in tok_text and not tok_text.startswith("-"):
            leads = [t[0] for t in tok_text.split("-") if t]
            kinds = ["H"] + ["h"] * (len(leads) - 1)
        else:
            leads, kinds = [tok_text[0]], ["w"]
        # lowercase per lead (not on a joined string) so positions stay aligned
        def_leads.extend(lead.lower() for lead in leads)
        def_types.extend(kinds)
        lead_tok_idxs.extend([tok_idx] * len(leads))

    # handle special case of '&' (read as "and") *before* stripping
    # non-alphanumeric characters, otherwise the '&' is already gone
    acr_leads = "".join(c for c in acronym.replace("&", "a") if c.isalnum())
    # handle special case of trailing 's'
    if acr_leads.endswith("s"):
        # bail out if it's only a 2-letter acronym to start with, e.g. 'Is'
        if len(acr_leads) == 2:
            return ("", 0.0)
        acr_leads = acr_leads[:-1]
    acr_leads = acr_leads.lower()

    # nothing to align against
    if not acr_leads or not def_leads:
        return ("", 0.0)

    c, b = build_lcs_matrix(acr_leads, def_leads)

    # 4.4.1
    lcs_length = int(c[-1, -1])
    # NOTE(review): the denominator is the raw acronym length, punctuation and
    # trailing 's' included -- confirm that is the intended normalization.
    confidence = lcs_length / len(acronym)
    if confidence < threshold or lcs_length == 0:
        return ("", confidence)

    vecs = parse_lcs_matrix(b, 0, 0, lcs_length, [], [])
    # first letter of acronym must be present
    vecs = [vec for vec in vecs if 0 in vec]
    if not vecs:
        return ("", confidence)

    best_vec = vecs[0]
    for vec in vecs[1:]:
        best_vec = compare_vectors(best_vec, vec, def_types)

    # positions of the first and last matched leads, mapped back to token indices
    first = best_vec.index(int(np.nanmin(best_vec)))
    last = best_vec.index(int(np.nanmax(best_vec)))
    definition = window[lead_tok_idxs[first] : lead_tok_idxs[last] + 1].text
    # a single-word "definition" is not accepted
    if len(definition.split()) == 1:
        return ("", confidence)

    return (definition, confidence)


def is_acronym(token: str, exclude: Optional[Set[str]] = None) -> bool:
    """
    Decide whether a single token, given as a string, is a valid acronym.

    Args:
        token: Single word to check for acronym-ness
        exclude: If technically valid but not actual acronyms are known in advance,
            pass them in as a set of strings; matching tokens will return False.

    Returns:
        Whether or not ``token`` is an acronym.
    """
    # empty strings and anything containing a space are never acronyms
    if not token or " " in token:
        return False
    # caller-supplied exclusions override an otherwise valid acronym
    if exclude and token in exclude:
        return False
    # 2-character acronyms can't have lower-case letters
    if len(token) == 2 and not token.isupper():
        return False
    # acronyms can't be all digits
    if token.isdigit():
        return False
    # acronyms must have at least one upper-case letter or start/end with a digit
    has_upper = any(ch.isupper() for ch in token)
    digit_at_edge = token[0].isdigit() or token[-1].isdigit()
    if not (has_upper or digit_at_edge):
        return False
    # acronyms must have between 2 and 10 alphanumeric characters
    n_alnum = sum(map(str.isalnum, token))
    if n_alnum < 2 or n_alnum > 10:
        return False
    # only certain combinations of letters, digits, and '&/.-' allowed
    return bool(constants.RE_ACRONYM.match(token))