# Source code for textacy.text_stats.api

"""
:mod:`textacy.text_stats.api`: Compute basic and readability statistics of documents.
"""
import functools
import logging
from typing import Optional, Tuple

import pyphen
from cachetools import cached
from cachetools.keys import hashkey
from spacy.tokens import Doc

from .. import cache, extract
from . import basics, readability


# Module-level logger: used below for debug messages when loading resources
# and for warnings when a readability stat is applied to a mismatched language.
LOGGER = logging.getLogger(__name__)


class TextStats:
    """
    Class to compute a variety of basic and readability statistics for a given doc,
    where each stat is a lazily-computed attribute.

    .. code-block:: pycon

        >>> text = next(textacy.datasets.CapitolWords().texts(limit=1))
        >>> doc = textacy.make_spacy_doc(text)
        >>> ts = textacy.text_stats.TextStats(doc)
        >>> ts.n_words
        136
        >>> ts.n_unique_words
        80
        >>> ts.entropy
        6.00420319027642
        >>> ts.flesch_kincaid_grade_level
        11.817647058823532
        >>> ts.flesch_reading_ease
        50.707745098039254

    Some stats vary by language or are designed for use with specific languages:

    .. code-block:: pycon

        >>> text = (
        ...     "Muchos años después, frente al pelotón de fusilamiento, "
        ...     "el coronel Aureliano Buendía había de recordar aquella tarde remota "
        ...     "en que su padre lo llevó a conocer el hielo."
        ... )
        >>> doc = textacy.make_spacy_doc(text, lang="es")
        >>> ts = textacy.text_stats.TextStats(doc)
        >>> ts.n_words
        28
        >>> ts.perspicuity_index
        56.46000000000002
        >>> ts.mu_legibility_index
        71.18644067796609

    Each of these stats have stand-alone functions in :mod:`textacy.text_stats.basics`
    and :mod:`textacy.text_stats.readability` with more detailed info and links
    in the docstrings -- when in doubt, read the docs!

    Args:
        doc: A text document tokenized and (optionally) sentence-segmented by spaCy.
    """

    def __init__(self, doc: Doc):
        self.doc = doc
        self.lang = doc.vocab.lang
        # Words are extracted once, up front: punctuation is dropped, but stop
        # words and numbers are kept, since the basic/readability stats count them.
        self.words = tuple(
            extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False)
        )
        # Per-stat caches; each is computed lazily on first property access.
        self._n_sents: Optional[int] = None
        self._n_words: Optional[int] = None
        self._n_unique_words: Optional[int] = None
        self._n_long_words: Optional[int] = None
        self._n_chars_per_word: Optional[Tuple[int, ...]] = None
        self._n_chars: Optional[int] = None
        self._n_syllables_per_word: Optional[Tuple[int, ...]] = None
        self._n_syllables: Optional[int] = None
        self._n_monosyllable_words: Optional[int] = None
        self._n_polysyllable_words: Optional[int] = None
        self._entropy: Optional[float] = None

    @property
    def n_sents(self) -> int:
        """
        Number of sentences in document.

        See Also:
            :func:`textacy.text_stats.basics.n_sents()`
        """
        if self._n_sents is None:
            self._n_sents = basics.n_sents(self.doc)
        return self._n_sents

    @property
    def n_words(self) -> int:
        """
        Number of words in document.

        See Also:
            :func:`textacy.text_stats.basics.n_words()`
        """
        if self._n_words is None:
            self._n_words = basics.n_words(self.words)
        return self._n_words

    @property
    def n_unique_words(self) -> int:
        """
        Number of *unique* words in document.

        See Also:
            :func:`textacy.text_stats.basics.n_unique_words()`
        """
        if self._n_unique_words is None:
            self._n_unique_words = basics.n_unique_words(self.words)
        return self._n_unique_words

    @property
    def n_long_words(self) -> int:
        """
        Number of long words in document.

        See Also:
            :func:`textacy.text_stats.basics.n_long_words()`
        """
        # TODO: should we vary char threshold by lang?
        if self._n_long_words is None:
            self._n_long_words = basics.n_long_words(
                self.n_chars_per_word, min_n_chars=7,
            )
        return self._n_long_words

    @property
    def n_chars_per_word(self) -> Tuple[int, ...]:
        """
        Number of characters for each word in document.

        See Also:
            :func:`textacy.text_stats.basics.n_chars_per_word()`
        """
        if self._n_chars_per_word is None:
            self._n_chars_per_word = basics.n_chars_per_word(self.words)
        return self._n_chars_per_word

    @property
    def n_chars(self) -> int:
        """
        Total number of characters in document.

        See Also:
            :func:`textacy.text_stats.basics.n_chars()`
        """
        if self._n_chars is None:
            self._n_chars = basics.n_chars(self.n_chars_per_word)
        return self._n_chars

    @property
    def n_syllables_per_word(self) -> Tuple[int, ...]:
        """
        Number of syllables for each word in document.

        See Also:
            :func:`textacy.text_stats.basics.n_syllables_per_word()`
        """
        if self._n_syllables_per_word is None:
            self._n_syllables_per_word = basics.n_syllables_per_word(
                self.words, self.lang,
            )
        return self._n_syllables_per_word

    @property
    def n_syllables(self) -> int:
        """
        Total number of syllables in document.

        See Also:
            :func:`textacy.text_stats.basics.n_syllables()`
        """
        if self._n_syllables is None:
            self._n_syllables = basics.n_syllables(self.n_syllables_per_word)
        return self._n_syllables

    @property
    def n_monosyllable_words(self) -> int:
        """
        Number of monosyllabic words in document.

        See Also:
            :func:`textacy.text_stats.basics.n_monosyllable_words()`
        """
        if self._n_monosyllable_words is None:
            self._n_monosyllable_words = basics.n_monosyllable_words(
                self.n_syllables_per_word,
            )
        return self._n_monosyllable_words

    @property
    def n_polysyllable_words(self) -> int:
        """
        Number of polysyllabic words in document.

        See Also:
            :func:`textacy.text_stats.basics.n_polysyllable_words()`
        """
        # TODO: should we vary syllable threshold by lang?
        if self._n_polysyllable_words is None:
            self._n_polysyllable_words = basics.n_polysyllable_words(
                self.n_syllables_per_word, min_n_syllables=3,
            )
        return self._n_polysyllable_words

    @property
    def entropy(self) -> float:
        """
        Entropy of words in document.

        See Also:
            :func:`textacy.text_stats.basics.entropy()`
        """
        if self._entropy is None:
            self._entropy = basics.entropy(self.words)
        return self._entropy

    @property
    def automated_readability_index(self) -> float:
        """
        Readability test for English-language texts. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.automated_readability_index()`
        """
        return readability.automated_readability_index(
            self.n_chars, self.n_words, self.n_sents,
        )

    @property
    def automatic_arabic_readability_index(self) -> float:
        """
        Readability test for Arabic-language texts. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.automatic_arabic_readability_index()`
        """
        if self.lang != "ar":
            # Pass the lang as a lazy %-arg so the placeholder is actually filled in.
            LOGGER.warning(
                "doc lang = '%s', but automatic arabic readability index is meant "
                "for use on Arabic-language texts, only",
                self.lang,
            )
        return readability.automatic_arabic_readability_index(
            self.n_chars, self.n_words, self.n_sents,
        )

    @property
    def coleman_liau_index(self) -> float:
        """
        Readability test, not language-specific. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.coleman_liau_index()`
        """
        return readability.coleman_liau_index(self.n_chars, self.n_words, self.n_sents)

    @property
    def flesch_kincaid_grade_level(self) -> float:
        """
        Readability test, not language-specific. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.flesch_kincaid_grade_level()`
        """
        return readability.flesch_kincaid_grade_level(
            self.n_syllables, self.n_words, self.n_sents,
        )

    @property
    def flesch_reading_ease(self) -> float:
        """
        Readability test with several language-specific formulations.
        Higher value => easier text.

        See Also:
            :func:`textacy.text_stats.readability.flesch_reading_ease()`
        """
        return readability.flesch_reading_ease(
            self.n_syllables, self.n_words, self.n_sents, lang=self.lang
        )

    @property
    def gulpease_index(self) -> float:
        """
        Readability test for Italian-language texts. Higher value => easier text.

        See Also:
            :func:`textacy.text_stats.readability.gulpease_index()`
        """
        if self.lang != "it":
            LOGGER.warning(
                "doc lang = '%s', but gulpease index is meant for use on "
                "Italian-language texts, only",
                self.lang,
            )
        return readability.gulpease_index(self.n_chars, self.n_words, self.n_sents)

    @property
    def gunning_fog_index(self) -> float:
        """
        Readability test, not language-specific. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.gunning_fog_index()`
        """
        return readability.gunning_fog_index(
            self.n_words, self.n_polysyllable_words, self.n_sents,
        )

    @property
    def lix(self) -> float:
        """
        Readability test for both English- and non-English-language texts.
        Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.lix()`
        """
        return readability.lix(self.n_words, self.n_long_words, self.n_sents)

    @property
    def mu_legibility_index(self) -> float:
        """
        Readability test for Spanish-language texts. Higher value => easier text.

        See Also:
            :func:`textacy.text_stats.readability.mu_legibility_index()`
        """
        if self.lang != "es":
            LOGGER.warning(
                "doc lang = '%s', but mu legibility index is meant for use on "
                "Spanish-language texts, only",
                self.lang,
            )
        return readability.mu_legibility_index(self.n_chars_per_word)

    @property
    def perspicuity_index(self) -> float:
        """
        Readability test for Spanish-language texts. Higher value => easier text.

        See Also:
            :func:`textacy.text_stats.readability.perspicuity_index()`
        """
        if self.lang != "es":
            LOGGER.warning(
                "doc lang = '%s', but perspicuity index is meant for use on "
                "Spanish-language texts, only",
                self.lang,
            )
        return readability.perspicuity_index(
            self.n_syllables, self.n_words, self.n_sents,
        )

    @property
    def smog_index(self) -> float:
        """
        Readability test, not language-specific. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.smog_index()`
        """
        return readability.smog_index(self.n_polysyllable_words, self.n_sents)

    @property
    def wiener_sachtextformel(self) -> float:
        """
        Readability test for German-language texts. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.wiener_sachtextformel()`
        """
        # BUGFIX: this is a German-language test, so warn on non-German docs
        # (the original compared against "es", copy-pasted from the Spanish tests).
        if self.lang != "de":
            LOGGER.warning(
                "doc lang = '%s', but wiener sachtextformel is meant for use on "
                "German-language texts, only",
                self.lang,
            )
        return readability.wiener_sachtextformel(
            self.n_words,
            self.n_polysyllable_words,
            self.n_monosyllable_words,
            self.n_long_words,
            self.n_sents,
            variant=1,
        )
@cached(cache.LRU_CACHE, key=functools.partial(hashkey, "hyphenator"))
def load_hyphenator(lang: str):
    """
    Load and return a hyphenator object for ``lang`` that splits words at valid
    hyphenation points, as used in LaTex typesetting. Results are cached in the
    shared LRU cache, so repeated calls for the same language are cheap.

    Args:
        lang: Standard 2-letter language abbreviation. To get a list of valid values::

            >>> import pyphen; pyphen.LANGUAGES

    Returns:
        :class:`pyphen.Pyphen()`
    """
    LOGGER.debug("loading '%s' language hyphenator", lang)
    hyphenator = pyphen.Pyphen(lang=lang)
    return hyphenator