"""
Basics
------

:mod:`textacy.extract.basics`: Extract basic components from a document or sentence
via spaCy, with bells and whistles for filtering the results.
"""
from __future__ import annotations

from functools import partial
from typing import Collection, Iterable, List, Optional, Set, Union

from cytoolz import itertoolz
from spacy.parts_of_speech import DET
from spacy.tokens import Span, Token

from .. import constants, errors, types, utils


def words(
    doclike: types.DocLike,
    *,
    filter_stops: bool = True,
    filter_punct: bool = True,
    filter_nums: bool = False,
    include_pos: Optional[str | Collection[str]] = None,
    exclude_pos: Optional[str | Collection[str]] = None,
    min_freq: int = 1,
) -> Iterable[Token]:
    """
    Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doclike: spaCy ``Doc`` or ``Span`` from which to extract words.
        filter_stops: If True, remove stop words from word list.
        filter_punct: If True, remove punctuation from word list.
        filter_nums: If True, remove number-like words (e.g. 10, "ten")
            from word list.
        include_pos: Remove words whose part-of-speech tag IS NOT in the
            specified tags.
        exclude_pos: Remove words whose part-of-speech tag IS in the
            specified tags.
        min_freq: Remove words that occur in ``doclike`` fewer than
            ``min_freq`` times.

    Yields:
        Next token from ``doclike`` passing specified filters in order of
        appearance in the document.

    Raises:
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of
            str, or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for
        details, check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    words_: Iterable[Token] = (w for w in doclike if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if include_pos:
        include_pos = utils.to_collection(include_pos, str, set)
        include_pos = {pos.upper() for pos in include_pos}
        words_ = (w for w in words_ if w.pos_ in include_pos)
    if exclude_pos:
        exclude_pos = utils.to_collection(exclude_pos, str, set)
        exclude_pos = {pos.upper() for pos in exclude_pos}
        words_ = (w for w in words_ if w.pos_ not in exclude_pos)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(w.lower_ for w in words_)
        words_ = (w for w in words_ if freqs[w.lower_] >= min_freq)
    for word in words_:
        yield word
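
# Usage sketch (illustrative, not part of the original module); assumes an
# English pipeline such as "en_core_web_sm" is installed:
#
#     >>> import spacy
#     >>> nlp = spacy.load("en_core_web_sm")
#     >>> doc = nlp("Two speakers gave ten short talks at the conference.")
#     >>> [w.text for w in words(doc, filter_nums=True, include_pos="NOUN")]
#
# Only noun tokens that are not number-like are yielded here, e.g. "speakers",
# "talks", and "conference" (exact results depend on the model's tagging).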


def ngrams(
    doclike: types.DocLike,
    n: int | Collection[int],
    *,
    filter_stops: bool = True,
    filter_punct: bool = True,
    filter_nums: bool = False,
    include_pos: Optional[str | Collection[str]] = None,
    exclude_pos: Optional[str | Collection[str]] = None,
    min_freq: int = 1,
) -> Iterable[Span]:
    """
    Extract an ordered sequence of n-grams (``n`` consecutive tokens) from
    a spaCy ``Doc`` or ``Span``, for one or multiple ``n`` values, optionally
    filtering n-grams by the types and parts-of-speech of the constituent tokens.

    Args:
        doclike: spaCy ``Doc`` or ``Span`` from which to extract n-grams.
        n: Number of tokens included per n-gram; for example, ``2`` yields bigrams
            and ``3`` yields trigrams. If multiple values are specified, then the
            collections of n-grams are concatenated together; for example, ``(2, 3)``
            yields bigrams and then trigrams.
        filter_stops: If True, remove ngrams that start or end with a stop word.
        filter_punct: If True, remove ngrams that contain any punctuation-only tokens.
        filter_nums: If True, remove ngrams that contain any numbers
            or number-like tokens (e.g. 10, 'ten').
        include_pos: Remove ngrams if any constituent tokens' part-of-speech tags
            ARE NOT included in this param.
        exclude_pos: Remove ngrams if any constituent tokens' part-of-speech tags
            ARE included in this param.
        min_freq: Remove ngrams that occur in ``doclike`` fewer than
            ``min_freq`` times.

    Yields:
        Next ngram from ``doclike`` passing all specified filters, in order of
        appearance in the document.

    Raises:
        ValueError: if any ``n`` < 1
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of
            str, or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for
        details, check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    ns = utils.to_collection(n, int, tuple)
    if any(n_ < 1 for n_ in ns):
        raise ValueError("n must be greater than or equal to 1")
    if include_pos:
        include_pos = {
            pos.upper() for pos in utils.to_collection(include_pos, str, set)
        }
    if exclude_pos:
        exclude_pos = {
            pos.upper() for pos in utils.to_collection(exclude_pos, str, set)
        }
    for n_ in ns:
        ngrams_ = (doclike[i : i + n_] for i in range(len(doclike) - n_ + 1))
        ngrams_ = (ng for ng in ngrams_ if not any(w.is_space for w in ng))
        if filter_stops is True:
            ngrams_ = (
                ng for ng in ngrams_ if not ng[0].is_stop and not ng[-1].is_stop
            )
        if filter_punct is True:
            ngrams_ = (ng for ng in ngrams_ if not any(w.is_punct for w in ng))
        if filter_nums is True:
            ngrams_ = (ng for ng in ngrams_ if not any(w.like_num for w in ng))
        if include_pos:
            ngrams_ = (
                ng for ng in ngrams_ if all(w.pos_ in include_pos for w in ng)
            )
        if exclude_pos:
            ngrams_ = (
                ng for ng in ngrams_ if not any(w.pos_ in exclude_pos for w in ng)
            )
        if min_freq > 1:
            ngrams_ = list(ngrams_)
            freqs = itertoolz.frequencies(ng.text.lower() for ng in ngrams_)
            ngrams_ = (ng for ng in ngrams_ if freqs[ng.text.lower()] >= min_freq)
        for ngram in ngrams_:
            yield ngram
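
# Usage sketch (illustrative), reusing the ``doc`` from the sketch above:
#
#     >>> list(ngrams(doc, (2, 3), include_pos={"ADJ", "NOUN"}))
#
# Because n-grams are generated separately for each ``n`` and then concatenated,
# all bigrams are yielded before any trigrams, each in document order.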


def entities(
    doclike: types.DocLike,
    *,
    include_types: Optional[str | Collection[str]] = None,
    exclude_types: Optional[str | Collection[str]] = None,
    drop_determiners: bool = True,
    min_freq: int = 1,
) -> Iterable[Span]:
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a ``Doc``, optionally filtering by entity types and frequencies.

    Args:
        doclike: spaCy ``Doc`` or ``Span`` from which to extract entities.
        include_types: Remove entities whose type IS NOT in this param;
            if "NUMERIC", all numeric entity types ("DATE", "MONEY", "ORDINAL", etc.)
            are included
        exclude_types: Remove entities whose type IS in this param; if "NUMERIC",
            all numeric entity types ("DATE", "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners: Remove leading determiners (e.g. "the")
            from entities (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               are, effectively, *new* entities, and not saved to the ``Doc``
               from which they came. This is irritating but unavoidable, since
               this function is not meant to have side-effects on document state.
               If you're only using the text of the returned spans, this is no
               big deal, but watch out if you're counting on determiner-less
               entities associated with the doc downstream.

        min_freq: Remove entities that occur in ``doclike`` fewer
            than ``min_freq`` times.

    Yields:
        Next entity from ``doclike`` passing all specified filters in order of
        appearance in the document

    Raises:
        TypeError: if ``include_types`` or ``exclude_types`` is not a str, a set of
            str, or a falsy value
    """
    ents = doclike.ents
    # HACK: spacy's models have been erroneously tagging whitespace as entities
    # https://github.com/explosion/spaCy/commit/1e6725e9b734862e61081a916baf440697b9971e
    ents = (ent for ent in ents if not ent.text.isspace())
    include_types = _parse_ent_types(include_types, "include")
    exclude_types = _parse_ent_types(exclude_types, "exclude")
    if include_types:
        if isinstance(include_types, str):
            ents = (ent for ent in ents if ent.label_ == include_types)
        elif isinstance(include_types, (set, frozenset, list, tuple)):
            ents = (ent for ent in ents if ent.label_ in include_types)
    if exclude_types:
        if isinstance(exclude_types, str):
            ents = (ent for ent in ents if ent.label_ != exclude_types)
        elif isinstance(exclude_types, (set, frozenset, list, tuple)):
            ents = (ent for ent in ents if ent.label_ not in exclude_types)
    if drop_determiners is True:
        ents = (
            ent
            if ent[0].pos != DET
            else Span(
                ent.doc, ent.start + 1, ent.end, label=ent.label, vector=ent.vector
            )
            for ent in ents
        )
    if min_freq > 1:
        ents = list(ents)
        freqs = itertoolz.frequencies(ent.text.lower() for ent in ents)
        ents = (ent for ent in ents if freqs[ent.text.lower()] >= min_freq)
    for ent in ents:
        yield ent
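
# Usage sketch (illustrative): restrict to specific entity types, or use the
# "NUMERIC" shorthand to exclude all numeric types at once:
#
#     >>> list(entities(doc, include_types={"PERSON", "ORG"}))
#     >>> list(entities(doc, exclude_types="NUMERIC"))
#
# With ``drop_determiners=True`` (the default), a match like "the United States"
# is yielded as a new ``Span`` covering just "United States".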


def _parse_ent_types(
    ent_types: Optional[str | Collection[str]], which: str
) -> Optional[str | Set[str]]:
    if not ent_types:
        return None
    elif isinstance(ent_types, str):
        ent_types = ent_types.upper()
        # replace the shorthand numeric case by its corresponding constant
        if ent_types == "NUMERIC":
            return constants.NUMERIC_ENT_TYPES
        else:
            return ent_types
    elif isinstance(ent_types, (set, frozenset, list, tuple)):
        ent_types = {ent_type.upper() for ent_type in ent_types}
        # again, replace the shorthand numeric case by its corresponding constant
        # and include it in the set in case other types are specified
        if any(ent_type == "NUMERIC" for ent_type in ent_types):
            return ent_types.union(constants.NUMERIC_ENT_TYPES)
        else:
            return ent_types
    else:
        raise TypeError(
            errors.type_invalid_msg(
                f"{which}_types", type(ent_types), Optional[Union[str, Collection[str]]]
            )
        )
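
# For example (illustrative), ``_parse_ent_types({"person", "numeric"}, "include")``
# upper-cases the given types and unions in ``constants.NUMERIC_ENT_TYPES``, so
# the returned set contains "PERSON" plus every numeric entity type.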


def noun_chunks(
    doclike: types.DocLike, *, drop_determiners: bool = True, min_freq: int = 1
) -> Iterable[Span]:
    """
    Extract an ordered sequence of noun chunks from a spacy-parsed doc,
    optionally filtering by frequency and dropping leading determiners.

    Args:
        doclike: spaCy ``Doc`` or ``Span`` from which to extract noun chunks.
        drop_determiners: Remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox")
        min_freq: Remove chunks that occur in ``doclike`` fewer than
            ``min_freq`` times.

    Yields:
        Next noun chunk from ``doclike`` in order of appearance in the document
    """
    ncs = doclike.noun_chunks
    if drop_determiners is True:
        ncs = (nc if nc[0].pos != DET else nc[1:] for nc in ncs)
    if min_freq > 1:
        ncs = list(ncs)
        freqs = itertoolz.frequencies(nc.text.lower() for nc in ncs)
        ncs = (nc for nc in ncs if freqs[nc.text.lower()] >= min_freq)
    for nc in ncs:
        yield nc
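
# Usage sketch (illustrative); noun chunks come from spaCy's syntactic parse,
# so the pipeline must populate ``doc.noun_chunks`` (e.g. via a parser):
#
#     >>> doc = nlp("The quick brown fox jumps over the lazy dog.")
#     >>> [nc.text for nc in noun_chunks(doc)]
#
# With determiners dropped (the default), this yields chunk texts like
# "quick brown fox" and "lazy dog" rather than "the quick brown fox".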


def terms(
    doclike: types.DocLike,
    *,
    ngs: Optional[int | Collection[int] | types.DocLikeToSpans] = None,
    ents: Optional[bool | types.DocLikeToSpans] = None,
    ncs: Optional[bool | types.DocLikeToSpans] = None,
    dedupe: bool = True,
) -> Iterable[Span]:
    """
    Extract one or multiple types of terms -- ngrams, entities, and/or noun chunks --
    from ``doclike`` as a single, concatenated collection, with optional deduplication
    of spans extracted by more than one type.

    .. code-block:: pycon

        >>> extract.terms(doc, ngs=2, ents=True, ncs=True)
        >>> extract.terms(doc, ngs=lambda doc: extract.ngrams(doc, n=2))
        >>> extract.terms(doc, ents=extract.entities)
        >>> extract.terms(doc, ents=partial(extract.entities, include_types="PERSON"))

    Args:
        doclike: spaCy ``Doc`` or ``Span`` from which to extract terms.
        ngs: N-gram terms to be extracted. If one or multiple ints,
            :func:`textacy.extract.ngrams(doclike, n=ngs)` is used to extract terms;
            if a callable, ``ngs(doclike)`` is used to extract terms;
            if None, no n-gram terms are extracted.
        ents: Entity terms to be extracted. If True,
            :func:`textacy.extract.entities(doclike)` is used to extract terms;
            if a callable, ``ents(doclike)`` is used to extract terms;
            if None, no entity terms are extracted.
        ncs: Noun chunk terms to be extracted. If True,
            :func:`textacy.extract.noun_chunks(doclike)` is used to extract terms;
            if a callable, ``ncs(doclike)`` is used to extract terms;
            if None, no noun chunk terms are extracted.
        dedupe: If True, deduplicate terms whose spans are extracted by multiple types
            (e.g. a span that is both an n-gram and an entity), as identified by
            identical (start, stop) indexes in ``doclike``; otherwise, don't.

    Yields:
        Next term from ``doclike``, in order of n-grams then entities then noun chunks,
        with each collection's terms given in order of appearance.

    Note:
        This function is *not* to be confused with keyterm extraction, which
        leverages statistics and algorithms to quantify the "key"-ness of terms
        before returning the top-ranking terms. There is no such scoring or
        ranking here.

    See Also:
        - :func:`textacy.extract.ngrams()`
        - :func:`textacy.extract.entities()`
        - :func:`textacy.extract.noun_chunks()`
        - :mod:`textacy.extract.keyterms`
    """
    extractors = _get_extractors(ngs, ents, ncs)
    terms_ = itertoolz.concat(extractor(doclike) for extractor in extractors)
    if dedupe is True:
        terms_ = itertoolz.unique(terms_, lambda span: (span.start, span.end))
    for term in terms_:
        yield term


def _get_extractors(ngs, ents, ncs) -> List[types.DocLikeToSpans]:
    all_extractors = [
        _get_ngs_extractor(ngs),
        _get_ents_extractor(ents),
        _get_ncs_extractor(ncs),
    ]
    extractors = [extractor for extractor in all_extractors if extractor is not None]
    if not extractors:
        raise ValueError("at least one term extractor must be specified")
    else:
        return extractors


def _get_ngs_extractor(ngs) -> Optional[types.DocLikeToSpans]:
    if ngs is None:
        return None
    elif callable(ngs):
        return ngs
    elif isinstance(ngs, int) or (
        isinstance(ngs, Collection) and all(isinstance(ng, int) for ng in ngs)
    ):
        return partial(ngrams, n=ngs)
    else:
        raise TypeError()


def _get_ents_extractor(ents) -> Optional[types.DocLikeToSpans]:
    if ents is None:
        return None
    elif callable(ents):
        return ents
    elif isinstance(ents, bool):
        return entities
    else:
        raise TypeError()


def _get_ncs_extractor(ncs) -> Optional[types.DocLikeToSpans]:
    if ncs is None:
        return None
    elif callable(ncs):
        return ncs
    elif isinstance(ncs, bool):
        return noun_chunks
    else:
        raise TypeError()