Source code for textacy.text_stats.readability

"""
Readability Stats
-----------------

:mod:`textacy.text_stats.readability`: Low-level functions for computing various measures
of text "readability", typically accessed via :class:`textacy.text_stats.TextStats`.
"""
import logging
import statistics
from math import sqrt
from typing import Collection, Optional

from .. import errors


LOGGER = logging.getLogger(__name__)

_FRE_COEFS = {
    "de": {"base": 180.0, "asl": 1.0, "awl": 58.5},
    "en": {"base": 206.835, "asl": 1.015, "awl": 84.6},
    "es": {"base": 206.835, "asl": 1.02, "awl": 60.0},  # 0.6 x 100
    "fr": {"base": 207.0, "asl": 1.015, "awl": 73.6},
    "it": {"base": 217.0, "asl": 1.3, "awl": 60.0},  # 0.6 x 100
    "nl": {"base": 206.835, "asl": 0.93, "awl": 77.0},
    "pt": {"base": 248.835, "asl": 1.015, "awl": 84.6},
    "ru": {"base": 206.835, "asl": 1.3, "awl": 60.1},
    "tr": {"base": 198.825, "asl": 2.610, "awl": 40.175},
}


def automated_readability_index(n_chars: int, n_words: int, n_sents: int) -> float:
    """
    Readability test for English-language texts, particularly for technical writing,
    whose value estimates the U.S. grade level required to understand a text.
    Similar to several other tests (e.g. :func:`flesch_kincaid_grade_level()`), but,
    like :func:`coleman_liau_index()`, it uses characters per word instead of syllables.
    Higher value => more difficult text.

    References:
        https://en.wikipedia.org/wiki/Automated_readability_index
    """
    return (4.71 * n_chars / n_words) + (0.5 * n_words / n_sents) - 21.43
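
# Usage sketch (illustrative only, not part of the original module): the counts below are
# hypothetical; in practice they come from :class:`textacy.text_stats.TextStats`.
# For a text with 2000 characters, 500 words, and 25 sentences:
#     automated_readability_index(n_chars=2000, n_words=500, n_sents=25)
#     # = (4.71 * 4.0) + (0.5 * 20.0) - 21.43 ≈ 7.41, i.e. roughly a 7th-grade level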


def automatic_arabic_readability_index(
    n_chars: int,
    n_words: int,
    n_sents: int,
) -> float:
    """
    Readability test for Arabic-language texts based on number of characters and
    average word and sentence lengths. Higher value => more difficult text.

    References:
        Al Tamimi, Abdel Karim, et al. "AARI: automatic arabic readability index."
        Int. Arab J. Inf. Technol. 11.4 (2014): 370-378.
    """
    return (3.28 * n_chars) + (1.43 * n_chars / n_words) + (1.24 * n_words / n_sents)


def coleman_liau_index(n_chars: int, n_words: int, n_sents: int) -> float:
    """
    Readability test whose value estimates the number of years of education required
    to understand a text, similar to :func:`flesch_kincaid_grade_level()` and
    :func:`smog_index()`, but using characters per word instead of syllables.
    Higher value => more difficult text.

    References:
        https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
    """
    return (5.879851 * n_chars / n_words) - (29.587280 * n_sents / n_words) - 15.800804


def flesch_kincaid_grade_level(n_syllables: int, n_words: int, n_sents: int) -> float:
    """
    Readability test used widely in education, whose value estimates the U.S. grade level /
    number of years of education required to understand a text.
    Higher value => more difficult text.

    References:
        https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch.E2.80.93Kincaid_grade_level
    """  # noqa: E501
    return (11.8 * n_syllables / n_words) + (0.39 * n_words / n_sents) - 15.59


def flesch_reading_ease(
    n_syllables: int,
    n_words: int,
    n_sents: int,
    *,
    lang: Optional[str] = None,
) -> float:
    """
    Readability test used as a general-purpose standard in several languages, based on
    a weighted combination of avg. sentence length and avg. word length. Values usually
    fall in the range [0, 100], but may be arbitrarily negative in extreme cases.
    Higher value => easier text.

    Note:
        Coefficients in this formula are language-dependent;
        if ``lang`` is null, the English-language formulation is used.

    References:
        English: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
        German: https://de.wikipedia.org/wiki/Lesbarkeitsindex#Flesch-Reading-Ease
        Spanish: Fernández-Huerta formulation
        French: ?
        Italian: https://it.wikipedia.org/wiki/Formula_di_Flesch
        Dutch: ?
        Portuguese: https://pt.wikipedia.org/wiki/Legibilidade_de_Flesch
        Turkish: Atesman formulation
        Russian: https://ru.wikipedia.org/wiki/%D0%98%D0%BD%D0%B4%D0%B5%D0%BA%D1%81_%D1%83%D0%B4%D0%BE%D0%B1%D0%BE%D1%87%D0%B8%D1%82%D0%B0%D0%B5%D0%BC%D0%BE%D1%81%D1%82%D0%B8
    """  # noqa: E501
    try:
        coefs = _FRE_COEFS[lang or "en"]
    except KeyError:
        raise ValueError(
            errors.value_invalid_msg("lang", lang, list(_FRE_COEFS.keys()))
        )
    return (
        coefs["base"]
        - (coefs["asl"] * n_words / n_sents)
        - (coefs["awl"] * n_syllables / n_words)
    )
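
# Usage sketch (illustrative only, not part of the original module): identical counts yield
# different scores depending on ``lang``, since the coefficients in ``_FRE_COEFS`` differ.
#     flesch_reading_ease(n_syllables=700, n_words=500, n_sents=25)
#     # English coefficients: 206.835 - (1.015 * 20.0) - (84.6 * 1.4) ≈ 68.1
#     flesch_reading_ease(n_syllables=700, n_words=500, n_sents=25, lang="de")
#     # German coefficients from the table above
#     flesch_reading_ease(n_syllables=700, n_words=500, n_sents=25, lang="xx")
#     # unsupported language code => raises ValueError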


def gulpease_index(n_chars: int, n_words: int, n_sents: int) -> float:
    """
    Readability test for Italian-language texts, whose value is in the range [0, 100],
    similar to :func:`flesch_reading_ease()`. Higher value => easier text.

    References:
        https://it.wikipedia.org/wiki/Indice_Gulpease
    """
    return (300 * n_sents / n_words) - (10 * n_chars / n_words) + 89


def gunning_fog_index(n_words: int, n_polysyllable_words: int, n_sents: int) -> float:
    """
    Readability test whose value estimates the number of years of education required
    to understand a text, similar to :func:`flesch_kincaid_grade_level()` and
    :func:`smog_index()`. Higher value => more difficult text.

    References:
        https://en.wikipedia.org/wiki/Gunning_fog_index
    """
    return 0.4 * ((n_words / n_sents) + (100 * n_polysyllable_words / n_words))


def lix(n_words: int, n_long_words: int, n_sents: int) -> float:
    """
    Readability test commonly used in Sweden on both English- and non-English-language
    texts, whose value estimates the difficulty of reading a foreign text.
    Higher value => more difficult text.

    References:
        https://en.wikipedia.org/wiki/Lix_(readability_test)
    """
    return (n_words / n_sents) + (100 * n_long_words / n_words)


def mu_legibility_index(n_chars_per_word: Collection[int]) -> float:
    """
    Readability test for Spanish-language texts based on number of words and
    the mean and variance of their lengths in characters, whose value is in
    the range [0, 100]. Higher value => easier text.

    References:
        Muñoz, M., and J. Muñoz. "Legibilidad Mµ." Viña del Mar: CHL (2006).
    """
    n_words = len(n_chars_per_word)
    if n_words < 2:
        LOGGER.warning(
            "mu legibility index is undefined for texts with fewer than two words; "
            "returning 0.0"
        )
        return 0.0
    return (
        100
        * (n_words / (n_words - 1))
        * (statistics.mean(n_chars_per_word) / statistics.variance(n_chars_per_word))
    )
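
# Usage sketch (illustrative only, not part of the original module): unlike the other tests
# here, this one takes the per-word character counts themselves rather than aggregate totals,
# e.g. the word lengths of "el perro come carne" -> [2, 5, 4, 5].
#     mu_legibility_index([2, 5, 4, 5])
#     mu_legibility_index([7])  # logs a warning and returns 0.0 (needs at least 2 words)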


def perspicuity_index(n_syllables: int, n_words: int, n_sents: int) -> float:
    """
    Readability test for Spanish-language texts, whose value is in the range [0, 100];
    very similar to the Spanish-specific formulation of :func:`flesch_reading_ease()`,
    but included additionally since it's become a common readability standard.
    Higher value => easier text.

    References:
        Pazos, Francisco Szigriszt. Sistemas predictivos de legibilidad del mensaje
        escrito: fórmula de perspicuidad. Universidad Complutense de Madrid,
        Servicio de Reprografía, 1993.
    """
    return 206.835 - (n_words / n_sents) - (62.3 * (n_syllables / n_words))


def smog_index(n_polysyllable_words: int, n_sents: int) -> float:
    """
    Readability test commonly used in medical writing and the healthcare industry,
    whose value estimates the number of years of education required to understand
    a text, similar to :func:`flesch_kincaid_grade_level()` and intended as
    a substitute for :func:`gunning_fog_index()`. Higher value => more difficult text.

    References:
        https://en.wikipedia.org/wiki/SMOG
    """
    if n_sents < 30:
        LOGGER.warning("SMOG index may be unreliable for n_sents < 30")
    return (1.0430 * sqrt(30 * n_polysyllable_words / n_sents)) + 3.1291
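
# Usage sketch (illustrative only, not part of the original module): for 60 polysyllabic
# words spread over 30 sentences, 30 * 60 / 30 = 60, so the index is
# 1.0430 * sqrt(60) + 3.1291 ≈ 11.21. With fewer than 30 sentences the value is still
# computed, but a warning is logged since the estimate may be unreliable.
#     smog_index(n_polysyllable_words=60, n_sents=30)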


def wiener_sachtextformel(
    n_words: int,
    n_polysyllable_words: int,
    n_monosyllable_words: int,
    n_long_words: int,
    n_sents: int,
    *,
    variant: int = 1,
) -> float:
    """
    Readability test for German-language texts, whose value estimates the grade level
    required to understand a text. Higher value => more difficult text.

    References:
        https://de.wikipedia.org/wiki/Lesbarkeitsindex#Wiener_Sachtextformel
    """
    ms = 100 * n_polysyllable_words / n_words
    sl = n_words / n_sents
    iw = 100 * n_long_words / n_words
    es = 100 * n_monosyllable_words / n_words
    if variant == 1:
        return (0.1935 * ms) + (0.1672 * sl) + (0.1297 * iw) - (0.0327 * es) - 0.875
    elif variant == 2:
        return (0.2007 * ms) + (0.1682 * sl) + (0.1373 * iw) - 2.779
    elif variant == 3:
        return (0.2963 * ms) + (0.1905 * sl) - 1.1144
    elif variant == 4:
        return (0.2744 * ms) + (0.2656 * sl) - 1.693
    else:
        raise ValueError(errors.value_invalid_msg("variant", variant, [1, 2, 3, 4]))
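
# Usage sketch (illustrative only, not part of the original module): all four variants share
# the same counts but weight them differently; ``variant`` is keyword-only, and an
# unsupported value raises a ValueError.
#     wiener_sachtextformel(n_words=500, n_polysyllable_words=50, n_monosyllable_words=200,
#                           n_long_words=100, n_sents=25, variant=1)
#     wiener_sachtextformel(n_words=500, n_polysyllable_words=50, n_monosyllable_words=200,
#                           n_long_words=100, n_sents=25, variant=5)  # raises ValueError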