"""
Basics
------

:mod:`textacy.extract.basics`: Extract basic components from a document or sentence
via spaCy, with bells and whistles for filtering the results.
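
Example sketch (assumes the ``en_core_web_sm`` pipeline is installed; the text
and filter values are illustrative, not documented output):

.. code-block:: pycon

    >>> import spacy
    >>> import textacy.extract
    >>> nlp = spacy.load("en_core_web_sm")
    >>> doc = nlp("The quick brown fox jumps over the lazy dog.")
    >>> list(textacy.extract.words(doc, filter_stops=True))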
"""
from __future__ import annotations
from functools import partial
from typing import Collection, Iterable, List, Optional, Set, Union
from cytoolz import itertoolz
from spacy.parts_of_speech import DET
from spacy.tokens import Span, Token
from .. import constants, errors, types, utils


def words(
doclike: types.DocLike,
*,
filter_stops: bool = True,
filter_punct: bool = True,
filter_nums: bool = False,
include_pos: Optional[str | Collection[str]] = None,
exclude_pos: Optional[str | Collection[str]] = None,
min_freq: int = 1,
) -> Iterable[Token]:
"""
Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doclike
        filter_stops: If True, remove stop words from word list.
        filter_punct: If True, remove punctuation from word list.
        filter_nums: If True, remove number-like words (e.g. 10, "ten")
            from word list.
        include_pos: Remove words whose part-of-speech tag IS NOT in the specified tags.
        exclude_pos: Remove words whose part-of-speech tag IS in the specified tags.
        min_freq: Remove words that occur in ``doclike`` fewer than ``min_freq`` times.

    Yields:
        Next token from ``doclike`` passing all specified filters, in order of
        appearance in the document.

    Raises:
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of str,
            or a falsy value.

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for details,
        check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
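
    Example:
        A minimal sketch, not documented output; assumes ``nlp`` is a loaded
        spaCy pipeline (e.g. ``spacy.load("en_core_web_sm")``) and the filter
        values shown are illustrative:

        .. code-block:: pycon

            >>> doc = nlp("The quick brown fox jumps over the lazy dog.")
            >>> [tok.text for tok in words(doc, filter_stops=True, filter_punct=True)]
            >>> [tok.text for tok in words(doc, include_pos={"NOUN", "VERB"})]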
"""
words_: Iterable[Token] = (w for w in doclike if not w.is_space)
if filter_stops is True:
words_ = (w for w in words_ if not w.is_stop)
if filter_punct is True:
words_ = (w for w in words_ if not w.is_punct)
if filter_nums is True:
words_ = (w for w in words_ if not w.like_num)
if include_pos:
include_pos = utils.to_collection(include_pos, str, set)
include_pos = {pos.upper() for pos in include_pos}
words_ = (w for w in words_ if w.pos_ in include_pos)
if exclude_pos:
exclude_pos = utils.to_collection(exclude_pos, str, set)
exclude_pos = {pos.upper() for pos in exclude_pos}
words_ = (w for w in words_ if w.pos_ not in exclude_pos)
if min_freq > 1:
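        # materialize the stream so tokens can be counted, then iterated again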
words_ = list(words_)
freqs = itertoolz.frequencies(w.lower_ for w in words_)
words_ = (w for w in words_ if freqs[w.lower_] >= min_freq)
for word in words_:
yield word


def ngrams(
doclike: types.DocLike,
n: int | Collection[int],
*,
filter_stops: bool = True,
filter_punct: bool = True,
filter_nums: bool = False,
include_pos: Optional[str | Collection[str]] = None,
exclude_pos: Optional[str | Collection[str]] = None,
min_freq: int = 1,
) -> Iterable[Span]:
"""
Extract an ordered sequence of n-grams (``n`` consecutive tokens) from a spaCy
``Doc`` or ``Span``, for one or multiple ``n`` values, optionally filtering n-grams
by the types and parts-of-speech of the constituent tokens.

    Args:
        doclike
        n: Number of tokens included per n-gram; for example, ``2`` yields bigrams
            and ``3`` yields trigrams. If multiple values are specified, then the
            collections of n-grams are concatenated together; for example, ``(2, 3)``
            yields bigrams and then trigrams.
        filter_stops: If True, remove ngrams that start or end with a stop word.
        filter_punct: If True, remove ngrams that contain any punctuation-only tokens.
        filter_nums: If True, remove ngrams that contain any numbers
            or number-like tokens (e.g. 10, 'ten').
        include_pos: Remove ngrams if any constituent tokens' part-of-speech tags
            ARE NOT included in this param.
        exclude_pos: Remove ngrams if any constituent tokens' part-of-speech tags
            ARE included in this param.
        min_freq: Remove ngrams that occur in ``doclike`` fewer than ``min_freq`` times.

    Yields:
        Next ngram from ``doclike`` passing all specified filters, in order of
        appearance in the document.

    Raises:
        ValueError: if any ``n`` < 1.
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of str,
            or a falsy value.

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for details,
        check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
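
    Example:
        A brief sketch; assumes ``nlp`` is a loaded spaCy pipeline and the
        parameter values are illustrative:

        .. code-block:: pycon

            >>> doc = nlp("I would not eat green eggs and ham.")
            >>> [ng.text for ng in ngrams(doc, 2, filter_stops=True)]
            >>> [ng.text for ng in ngrams(doc, (2, 3), include_pos={"NOUN", "ADJ"})]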
"""
ns = utils.to_collection(n, int, tuple)
if any(n_ < 1 for n_ in ns):
raise ValueError("n must be greater than or equal to 1")
if include_pos:
include_pos = {
pos.upper() for pos in utils.to_collection(include_pos, str, set)
}
if exclude_pos:
exclude_pos = {
pos.upper() for pos in utils.to_collection(exclude_pos, str, set)
}
for n_ in ns:
ngrams_ = (doclike[i : i + n_] for i in range(len(doclike) - n_ + 1))
ngrams_ = (ng for ng in ngrams_ if not any(w.is_space for w in ng))
if filter_stops is True:
ngrams_ = (ng for ng in ngrams_ if not ng[0].is_stop and not ng[-1].is_stop)
if filter_punct is True:
ngrams_ = (ng for ng in ngrams_ if not any(w.is_punct for w in ng))
if filter_nums is True:
ngrams_ = (ng for ng in ngrams_ if not any(w.like_num for w in ng))
if include_pos:
ngrams_ = (ng for ng in ngrams_ if all(w.pos_ in include_pos for w in ng))
if exclude_pos:
ngrams_ = (ng for ng in ngrams_ if not any(w.pos_ in exclude_pos for w in ng))
if min_freq > 1:
ngrams_ = list(ngrams_)
freqs = itertoolz.frequencies(ng.text.lower() for ng in ngrams_)
ngrams_ = (ng for ng in ngrams_ if freqs[ng.text.lower()] >= min_freq)
for ngram in ngrams_:
yield ngram


def entities(
doclike: types.DocLike,
*,
include_types: Optional[str | Collection[str]] = None,
exclude_types: Optional[str | Collection[str]] = None,
drop_determiners: bool = True,
min_freq: int = 1,
) -> Iterable[Span]:
"""
Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
a ``Doc``, optionally filtering by entity types and frequencies.

    Args:
        doclike
        include_types: Remove entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included.
        exclude_types: Remove entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded.
        drop_determiners: Remove leading determiners (e.g. "the")
            from entities (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               are, effectively, *new* entities, and not saved to the ``Doc``
               from which they came. This is irritating but unavoidable, since
               this function is not meant to have side-effects on document state.
               If you're only using the text of the returned spans, this is no
               big deal, but watch out if you're counting on determiner-less
               entities associated with the doc downstream.

        min_freq: Remove entities that occur in ``doclike`` fewer
            than ``min_freq`` times.

    Yields:
        Next entity from ``doclike`` passing all specified filters, in order of
        appearance in the document.

    Raises:
        TypeError: if ``include_types`` or ``exclude_types`` is not a str, a set of
            str, or a falsy value.
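
    Example:
        A brief sketch; assumes ``nlp`` is a loaded spaCy pipeline with a NER
        component, and the type filters shown are illustrative:

        .. code-block:: pycon

            >>> doc = nlp("Burton DeWilde visited the United States in 2021.")
            >>> [ent.text for ent in entities(doc, include_types="PERSON")]
            >>> [ent.text for ent in entities(doc, exclude_types="NUMERIC")]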
"""
ents = doclike.ents
# HACK: spacy's models have been erroneously tagging whitespace as entities
# https://github.com/explosion/spaCy/commit/1e6725e9b734862e61081a916baf440697b9971e
ents = (ent for ent in ents if not ent.text.isspace())
include_types = _parse_ent_types(include_types, "include")
exclude_types = _parse_ent_types(exclude_types, "exclude")
if include_types:
if isinstance(include_types, str):
ents = (ent for ent in ents if ent.label_ == include_types)
elif isinstance(include_types, (set, frozenset, list, tuple)):
ents = (ent for ent in ents if ent.label_ in include_types)
if exclude_types:
if isinstance(exclude_types, str):
ents = (ent for ent in ents if ent.label_ != exclude_types)
elif isinstance(exclude_types, (set, frozenset, list, tuple)):
ents = (ent for ent in ents if ent.label_ not in exclude_types)
if drop_determiners is True:
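        # rebuild each determiner-led entity as a new Span without its leading
        # token; see the note in the docstring about side-effect-free behavior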
ents = (
ent
if ent[0].pos != DET
else Span(
ent.doc, ent.start + 1, ent.end, label=ent.label, vector=ent.vector
)
for ent in ents
)
if min_freq > 1:
ents = list(ents)
freqs = itertoolz.frequencies(ent.text.lower() for ent in ents)
ents = (ent for ent in ents if freqs[ent.text.lower()] >= min_freq)
for ent in ents:
yield ent


def _parse_ent_types(
ent_types: Optional[str | Collection[str]], which: str,
) -> Optional[str | Set[str]]:
if not ent_types:
return None
elif isinstance(ent_types, str):
ent_types = ent_types.upper()
# replace the shorthand numeric case by its corresponding constant
if ent_types == "NUMERIC":
return constants.NUMERIC_ENT_TYPES
else:
return ent_types
elif isinstance(ent_types, (set, frozenset, list, tuple)):
ent_types = {ent_type.upper() for ent_type in ent_types}
# again, replace the shorthand numeric case by its corresponding constant
# and include it in the set in case other types are specified
if any(ent_type == "NUMERIC" for ent_type in ent_types):
return ent_types.union(constants.NUMERIC_ENT_TYPES)
else:
return ent_types
else:
raise TypeError(
errors.type_invalid_msg(
f"{which}_types", type(ent_types), Optional[Union[str, Collection[str]]]
)
)


def noun_chunks(
doclike: types.DocLike, *, drop_determiners: bool = True, min_freq: int = 1,
) -> Iterable[Span]:
"""
Extract an ordered sequence of noun chunks from a spacy-parsed doc, optionally
filtering by frequency and dropping leading determiners.

    Args:
        doclike
        drop_determiners: Remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox").
        min_freq: Remove chunks that occur in ``doclike`` fewer than ``min_freq`` times.

    Yields:
        Next noun chunk from ``doclike`` in order of appearance in the document.
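
    Example:
        A brief sketch; assumes ``nlp`` is a loaded spaCy pipeline with a parser
        (noun chunks require dependency parses):

        .. code-block:: pycon

            >>> doc = nlp("The quick brown fox jumps over the lazy dog.")
            >>> [nc.text for nc in noun_chunks(doc, drop_determiners=True)]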
"""
ncs = doclike.noun_chunks
if drop_determiners is True:
ncs = (nc if nc[0].pos != DET else nc[1:] for nc in ncs)
if min_freq > 1:
ncs = list(ncs)
freqs = itertoolz.frequencies(nc.text.lower() for nc in ncs)
ncs = (nc for nc in ncs if freqs[nc.text.lower()] >= min_freq)
for nc in ncs:
yield nc


def terms(
doclike: types.DocLike,
*,
ngs: Optional[int | Collection[int] | types.DocLikeToSpans] = None,
ents: Optional[bool | types.DocLikeToSpans] = None,
ncs: Optional[bool | types.DocLikeToSpans] = None,
dedupe: bool = True,
) -> Iterable[Span]:
"""
Extract one or multiple types of terms -- ngrams, entities, and/or noun chunks --
from ``doclike`` as a single, concatenated collection, with optional deduplication
of spans extracted by more than one type.

    .. code-block:: pycon

        >>> extract.terms(doc, ngs=2, ents=True, ncs=True)
        >>> extract.terms(doc, ngs=lambda doc: extract.ngrams(doc, n=2))
        >>> extract.terms(doc, ents=extract.entities)
        >>> extract.terms(doc, ents=partial(extract.entities, include_types="PERSON"))
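        >>> # a hypothetical combination of all three extractor types, deduplicated
        >>> # (assumes `doc` is a spaCy Doc; the parameter values are illustrative):
        >>> extract.terms(doc, ngs=(2, 3), ents=True, ncs=True, dedupe=True)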

    Args:
        doclike
        ngs: N-gram terms to be extracted.
            If one or multiple ints, :func:`textacy.extract.ngrams(doclike, n=ngs)` is
            used to extract terms; if a callable, ``ngs(doclike)`` is used to extract
            terms; if None, no n-gram terms are extracted.
        ents: Entity terms to be extracted.
            If True, :func:`textacy.extract.entities(doclike)` is used to extract terms;
            if a callable, ``ents(doclike)`` is used to extract terms;
            if None, no entity terms are extracted.
        ncs: Noun chunk terms to be extracted.
            If True, :func:`textacy.extract.noun_chunks(doclike)` is used to extract
            terms; if a callable, ``ncs(doclike)`` is used to extract terms;
            if None, no noun chunk terms are extracted.
        dedupe: If True, deduplicate terms whose spans are extracted by multiple types
            (e.g. a span that is both an n-gram and an entity), as identified by
            identical (start, stop) indexes in ``doclike``; otherwise, don't.

    Yields:
        Next term from ``doclike``, in order of n-grams then entities then noun chunks,
        with each collection's terms given in order of appearance.

    Note:
        This function is *not* to be confused with keyterm extraction, which leverages
        statistics and algorithms to quantify the "key"-ness of terms before returning
        the top-ranking terms. There is no such scoring or ranking here.

    See Also:
        - :func:`textacy.extract.ngrams()`
        - :func:`textacy.extract.entities()`
        - :func:`textacy.extract.noun_chunks()`
        - :mod:`textacy.extract.keyterms`
"""
extractors = _get_extractors(ngs, ents, ncs)
terms_ = itertoolz.concat(extractor(doclike) for extractor in extractors)
if dedupe is True:
terms_ = itertoolz.unique(terms_, lambda span: (span.start, span.end))
for term in terms_:
yield term


def _get_extractors(ngs, ents, ncs) -> List[types.DocLikeToSpans]:
all_extractors = [
_get_ngs_extractor(ngs), _get_ents_extractor(ents), _get_ncs_extractor(ncs)
]
extractors = [extractor for extractor in all_extractors if extractor is not None]
if not extractors:
raise ValueError("at least one term extractor must be specified")
else:
return extractors


def _get_ngs_extractor(ngs) -> Optional[types.DocLikeToSpans]:
if ngs is None:
return None
elif callable(ngs):
return ngs
elif (
isinstance(ngs, int)
or (isinstance(ngs, Collection) and all(isinstance(ng, int) for ng in ngs))
):
return partial(ngrams, n=ngs)
    else:
        raise TypeError(
            errors.type_invalid_msg(
                "ngs", type(ngs), Optional[Union[int, Collection[int], types.DocLikeToSpans]]
            )
        )


def _get_ents_extractor(ents) -> Optional[types.DocLikeToSpans]:
if ents is None:
return None
elif callable(ents):
return ents
elif isinstance(ents, bool):
return entities
    else:
        raise TypeError(
            errors.type_invalid_msg(
                "ents", type(ents), Optional[Union[bool, types.DocLikeToSpans]]
            )
        )


def _get_ncs_extractor(ncs) -> Optional[types.DocLikeToSpans]:
if ncs is None:
return None
elif callable(ncs):
return ncs
elif isinstance(ncs, bool):
return noun_chunks
    else:
        raise TypeError(
            errors.type_invalid_msg(
                "ncs", type(ncs), Optional[Union[bool, types.DocLikeToSpans]]
            )
        )