Source code for textacy.text_stats.components

"""
Pipeline Components
-------------------

:mod:`textacy.text_stats.components`: Custom components to add to a spaCy language pipeline.
"""
# TODO: figure out why this breaks the code...
# from __future__ import annotations

import inspect
import logging
from typing import Collection, Optional, Union

from spacy.language import Language
from spacy.tokens import Doc

from . import api

LOGGER = logging.getLogger(__name__)


@Language.factory(
    "textacy_text_stats",
    default_config={"attrs": None},
    retokenizes=False,
)
def create_text_stats_component(
    nlp: Language, name: str, attrs: Optional[Union[str, Collection[str]]]
):
    return TextStatsComponent(attrs=attrs)


[docs]class TextStatsComponent: """ A custom component to be added to a spaCy language pipeline that computes one, some, or all text stats for a parsed doc and sets the values as custom attributes on a :class:`spacy.tokens.Doc`. Add the component to a pipeline, *after* the parser and any subsequent components that modify the tokens/sentences of the doc (to be safe, just put it last):: >>> en = spacy.load("en_core_web_sm") >>> en.add_pipe("textacy_text_stats", last=True) Process a text with the pipeline and access the custom attributes via spaCy's underscore syntax:: >>> doc = en(u"This is a test test someverylongword.") >>> doc._.n_words 6 >>> doc._.flesch_reading_ease 73.84500000000001 Specify which attributes of the :class:`textacy.text_stats.TextStats()` to add to processed documents:: >>> en = spacy.load("en_core_web_sm") >>> en.add_pipe("textacy_text_stats", last=True, config={"attrs": "n_words"}) >>> doc = en(u"This is a test test someverylongword.") >>> doc._.n_words 6 >>> doc._.flesch_reading_ease AttributeError: [E046] Can't retrieve unregistered extension attribute 'flesch_reading_ease'. Did you forget to call the `set_extension` method? Args: attr: If str, a single text stat to compute and set on a :obj:`Doc`; if Iterable[str], set multiple text stats; if None, *all* text stats are computed and set as extensions. See Also: :class:`textacy.text_stats.TextStats` """ def __init__(self, attrs: Optional[Union[str, Collection[str]]] = None): self._set_attrs(attrs) for attr in self.attrs: # TODO: see if there's a better way to handle this # that doesn't involve clobbering existing property extensions Doc.set_extension(attr, default=None, force=True) LOGGER.debug('"%s" custom attribute added to `spacy.tokens.Doc`') def __call__(self, doc: Doc) -> Doc: ts = api.TextStats(doc) for attr in self.attrs: try: doc._.set(attr, getattr(ts, attr)) except AttributeError: LOGGER.exception( "`TextStats` class doesn't have '%s' attribute, so it can't " "be set on this `Doc`. Check the attrs used to initialize " "the `TextStatsComponent` in this pipeline for errors.", attr, ) raise return doc def _set_attrs(self, attrs: Optional[Union[str, Collection[str]]]): if attrs is None: self.attrs = tuple( name for name, _ in inspect.getmembers( api.TextStats, lambda memb: not(inspect.isroutine(memb)) ) if not name.startswith("_") ) elif isinstance(attrs, str): self.attrs = (attrs,) else: self.attrs = tuple(attrs)