"""
:mod:`textacy.text_stats.api`: Compute basic and readability statistics of documents.
"""
import functools
import logging
from typing import Optional, Tuple
import pyphen
from cachetools import cached
from cachetools.keys import hashkey
from spacy.tokens import Doc
from .. import cache, extract
from . import basics, readability
LOGGER = logging.getLogger(__name__)
class TextStats:
    """
    Class to compute a variety of basic and readability statistics for a given doc,
    where each stat is a lazily-computed attribute.

    .. code-block:: pycon

        >>> text = next(textacy.datasets.CapitolWords().texts(limit=1))
        >>> doc = textacy.make_spacy_doc(text)
        >>> ts = textacy.text_stats.TextStats(doc)
        >>> ts.n_words
        136
        >>> ts.n_unique_words
        80
        >>> ts.entropy
        6.00420319027642
        >>> ts.flesch_kincaid_grade_level
        11.817647058823532
        >>> ts.flesch_reading_ease
        50.707745098039254

    Some stats vary by language or are designed for use with specific languages:

    .. code-block:: pycon

        >>> text = (
        ...     "Muchos años después, frente al pelotón de fusilamiento, "
        ...     "el coronel Aureliano Buendía había de recordar aquella tarde remota "
        ...     "en que su padre lo llevó a conocer el hielo."
        ... )
        >>> doc = textacy.make_spacy_doc(text, lang="es")
        >>> ts = textacy.text_stats.TextStats(doc)
        >>> ts.n_words
        28
        >>> ts.perspicuity_index
        56.46000000000002
        >>> ts.mu_legibility_index
        71.18644067796609

    Each of these stats has a stand-alone function in :mod:`textacy.text_stats.basics`
    or :mod:`textacy.text_stats.readability` with more detailed info and links
    in the docstrings -- when in doubt, read the docs!

    Stats designed for one specific language log a warning, but still return
    a value, when the doc's language is a different one.

    Args:
        doc: A text document tokenized and (optionally) sentence-segmented by spaCy.

    Attributes:
        doc: The document passed at construction.
        lang: Language of the document, taken from ``doc.vocab.lang``.
        words: Words extracted from ``doc`` with punctuation filtered out,
            but stop words and numbers kept.
    """

    def __init__(self, doc: Doc):
        self.doc = doc
        self.lang = doc.vocab.lang
        self.words = tuple(
            extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False)
        )
        # Per-instance caches for the basic stats: ``None`` means "not computed yet";
        # each one is filled in by its same-named property on first access.
        self._n_sents: Optional[int] = None
        self._n_words: Optional[int] = None
        self._n_unique_words: Optional[int] = None
        self._n_long_words: Optional[int] = None
        self._n_chars_per_word: Optional[Tuple[int, ...]] = None
        self._n_chars: Optional[int] = None
        self._n_syllables_per_word: Optional[Tuple[int, ...]] = None
        self._n_syllables: Optional[int] = None
        self._n_monosyllable_words: Optional[int] = None
        self._n_polysyllable_words: Optional[int] = None
        self._entropy: Optional[float] = None

    @property
    def n_sents(self) -> int:
        """
        Number of sentences in document.

        See Also:
            :func:`textacy.text_stats.basics.n_sents()`
        """
        if self._n_sents is None:
            self._n_sents = basics.n_sents(self.doc)
        return self._n_sents

    @property
    def n_words(self) -> int:
        """
        Number of words in document.

        See Also:
            :func:`textacy.text_stats.basics.n_words()`
        """
        if self._n_words is None:
            self._n_words = basics.n_words(self.words)
        return self._n_words

    @property
    def n_unique_words(self) -> int:
        """
        Number of *unique* words in document.

        See Also:
            :func:`textacy.text_stats.basics.n_unique_words()`
        """
        if self._n_unique_words is None:
            self._n_unique_words = basics.n_unique_words(self.words)
        return self._n_unique_words

    @property
    def n_long_words(self) -> int:
        """
        Number of long words in document, computed with ``min_n_chars=7``.

        See Also:
            :func:`textacy.text_stats.basics.n_long_words()`
        """
        # TODO: should we vary char threshold by lang?
        if self._n_long_words is None:
            self._n_long_words = basics.n_long_words(
                self.n_chars_per_word, min_n_chars=7,
            )
        return self._n_long_words

    @property
    def n_chars_per_word(self) -> Tuple[int, ...]:
        """
        Number of characters for each word in document.

        See Also:
            :func:`textacy.text_stats.basics.n_chars_per_word()`
        """
        if self._n_chars_per_word is None:
            self._n_chars_per_word = basics.n_chars_per_word(self.words)
        return self._n_chars_per_word

    @property
    def n_chars(self) -> int:
        """
        Total number of characters in document.

        See Also:
            :func:`textacy.text_stats.basics.n_chars()`
        """
        if self._n_chars is None:
            self._n_chars = basics.n_chars(self.n_chars_per_word)
        return self._n_chars

    @property
    def n_syllables_per_word(self) -> Tuple[int, ...]:
        """
        Number of syllables for each word in document.

        See Also:
            :func:`textacy.text_stats.basics.n_syllables_per_word()`
        """
        if self._n_syllables_per_word is None:
            self._n_syllables_per_word = basics.n_syllables_per_word(
                self.words, self.lang,
            )
        return self._n_syllables_per_word

    @property
    def n_syllables(self) -> int:
        """
        Total number of syllables in document.

        See Also:
            :func:`textacy.text_stats.basics.n_syllables()`
        """
        if self._n_syllables is None:
            self._n_syllables = basics.n_syllables(self.n_syllables_per_word)
        return self._n_syllables

    @property
    def n_monosyllable_words(self) -> int:
        """
        Number of monosyllabic words in document.

        See Also:
            :func:`textacy.text_stats.basics.n_monosyllable_words()`
        """
        if self._n_monosyllable_words is None:
            self._n_monosyllable_words = basics.n_monosyllable_words(
                self.n_syllables_per_word,
            )
        return self._n_monosyllable_words

    @property
    def n_polysyllable_words(self) -> int:
        """
        Number of polysyllabic words in document, computed with ``min_n_syllables=3``.

        See Also:
            :func:`textacy.text_stats.basics.n_polysyllable_words()`
        """
        # TODO: should we vary syllable threshold by lang?
        if self._n_polysyllable_words is None:
            self._n_polysyllable_words = basics.n_polysyllable_words(
                self.n_syllables_per_word, min_n_syllables=3,
            )
        return self._n_polysyllable_words

    @property
    def entropy(self) -> float:
        """
        Entropy of words in document.

        See Also:
            :func:`textacy.text_stats.basics.entropy()`
        """
        if self._entropy is None:
            self._entropy = basics.entropy(self.words)
        return self._entropy

    @property
    def automated_readability_index(self) -> float:
        """
        Readability test for English-language texts. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.automated_readability_index()`
        """
        return readability.automated_readability_index(
            self.n_chars, self.n_words, self.n_sents,
        )

    @property
    def automatic_arabic_readability_index(self) -> float:
        """
        Readability test for Arabic-language texts. Higher value => more difficult text.

        Logs a warning if the doc's language is not Arabic ("ar").

        See Also:
            :func:`textacy.text_stats.readability.automatic_arabic_readability_index()`
        """
        if self.lang != "ar":
            # pass the lang as a logging arg so the '%s' placeholder is filled in
            LOGGER.warning(
                "doc lang = '%s', but automatic arabic readability index is meant "
                "for use on Arabic-language texts, only",
                self.lang,
            )
        return readability.automatic_arabic_readability_index(
            self.n_chars, self.n_words, self.n_sents,
        )

    @property
    def coleman_liau_index(self) -> float:
        """
        Readability test, not language-specific. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.coleman_liau_index()`
        """
        return readability.coleman_liau_index(self.n_chars, self.n_words, self.n_sents)

    @property
    def flesch_kincaid_grade_level(self) -> float:
        """
        Readability test, not language-specific. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.flesch_kincaid_grade_level()`
        """
        return readability.flesch_kincaid_grade_level(
            self.n_syllables, self.n_words, self.n_sents,
        )

    @property
    def flesch_reading_ease(self) -> float:
        """
        Readability test with several language-specific formulations.
        Higher value => easier text.

        See Also:
            :func:`textacy.text_stats.readability.flesch_reading_ease()`
        """
        return readability.flesch_reading_ease(
            self.n_syllables, self.n_words, self.n_sents, lang=self.lang
        )

    @property
    def gulpease_index(self) -> float:
        """
        Readability test for Italian-language texts. Higher value => easier text.

        Logs a warning if the doc's language is not Italian ("it").

        See Also:
            :func:`textacy.text_stats.readability.gulpease_index()`
        """
        if self.lang != "it":
            LOGGER.warning(
                "doc lang = '%s', but gulpease index is meant for use on "
                "Italian-language texts, only",
                self.lang,
            )
        return readability.gulpease_index(self.n_chars, self.n_words, self.n_sents)

    @property
    def gunning_fog_index(self) -> float:
        """
        Readability test, not language-specific. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.gunning_fog_index()`
        """
        return readability.gunning_fog_index(
            self.n_words, self.n_polysyllable_words, self.n_sents,
        )

    @property
    def lix(self) -> float:
        """
        Readability test for both English- and non-English-language texts.
        Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.lix()`
        """
        return readability.lix(self.n_words, self.n_long_words, self.n_sents)

    @property
    def mu_legibility_index(self) -> float:
        """
        Readability test for Spanish-language texts. Higher value => easier text.

        Logs a warning if the doc's language is not Spanish ("es").

        See Also:
            :func:`textacy.text_stats.readability.mu_legibility_index()`
        """
        if self.lang != "es":
            LOGGER.warning(
                "doc lang = '%s', but mu legibility index is meant for use on "
                "Spanish-language texts, only",
                self.lang,
            )
        return readability.mu_legibility_index(self.n_chars_per_word)

    @property
    def perspicuity_index(self) -> float:
        """
        Readability test for Spanish-language texts. Higher value => easier text.

        Logs a warning if the doc's language is not Spanish ("es").

        See Also:
            :func:`textacy.text_stats.readability.perspicuity_index()`
        """
        if self.lang != "es":
            LOGGER.warning(
                "doc lang = '%s', but perspicuity index is meant for use on "
                "Spanish-language texts, only",
                self.lang,
            )
        return readability.perspicuity_index(
            self.n_syllables, self.n_words, self.n_sents,
        )

    @property
    def smog_index(self) -> float:
        """
        Readability test, not language-specific. Higher value => more difficult text.

        See Also:
            :func:`textacy.text_stats.readability.smog_index()`
        """
        return readability.smog_index(self.n_polysyllable_words, self.n_sents)

    @property
    def wiener_sachtextformel(self) -> float:
        """
        Readability test for German-language texts. Higher value => more difficult text.

        Always computed with ``variant=1``. Logs a warning if the doc's language
        is not German ("de").

        See Also:
            :func:`textacy.text_stats.readability.wiener_sachtextformel()`
        """
        # German is "de"; comparing against "es" warned on German docs
        # and stayed silent on Spanish ones.
        if self.lang != "de":
            LOGGER.warning(
                "doc lang = '%s', but wiener sachtextformel is meant for use on "
                "German-language texts, only",
                self.lang,
            )
        return readability.wiener_sachtextformel(
            self.n_words,
            self.n_polysyllable_words,
            self.n_monosyllable_words,
            self.n_long_words,
            self.n_sents,
            variant=1,
        )
@cached(cache.LRU_CACHE, key=functools.partial(hashkey, "hyphenator"))
def load_hyphenator(lang: str):
    """
    Build a hyphenator for the given language: an object that splits words at
    valid hyphenation points, as used in LaTeX typesetting.

    Calls are memoized in ``cache.LRU_CACHE`` under a key prefixed with
    "hyphenator".

    Args:
        lang: Standard 2-letter language abbreviation. To get a list of valid values::

            >>> import pyphen; pyphen.LANGUAGES

    Returns:
        :class:`pyphen.Pyphen()`
    """
    LOGGER.debug("loading '%s' language hyphenator", lang)
    hyphenator = pyphen.Pyphen(lang=lang)
    return hyphenator