"""
Readability Stats
-----------------
:mod:`textacy.text_stats.readability`: Low-level functions for computing various measures
of text "readability", typically accessed via :class:`textacy.text_stats.TextStats`.
"""
import logging
import statistics
from math import sqrt
from typing import Collection, Optional
from .. import errors
LOGGER = logging.getLogger(__name__)

# Per-language coefficients for the Flesch Reading Ease formula, keyed by
# language code. Each entry is applied as:
#   base - (asl * words per sentence) - (awl * syllables per word)
_FRE_COEFS = {
    "de": {
        "base": 180.0,
        "asl": 1.0,
        "awl": 58.5,
    },
    "en": {
        "base": 206.835,
        "asl": 1.015,
        "awl": 84.6,
    },
    "es": {
        "base": 206.835,
        "asl": 1.02,
        "awl": 60.0,  # 0.6 x 100
    },
    "fr": {
        "base": 207.0,
        "asl": 1.015,
        "awl": 73.6,
    },
    "it": {
        "base": 217.0,
        "asl": 1.3,
        "awl": 60.0,  # 0.6 x 100
    },
    "nl": {
        "base": 206.835,
        "asl": 0.93,
        "awl": 77.0,
    },
    "pt": {
        "base": 248.835,
        "asl": 1.015,
        "awl": 84.6,
    },
    "ru": {
        "base": 206.835,
        "asl": 1.3,
        "awl": 60.1,
    },
    "tr": {
        "base": 198.825,
        "asl": 2.610,
        "awl": 40.175,
    },
}
def automated_readability_index(n_chars: int, n_words: int, n_sents: int) -> float:
    """
    Readability test for English-language texts, particularly for technical writing,
    whose value estimates the U.S. grade level required to understand a text.
    Similar to several other tests (e.g. :func:`flesch_kincaid_grade_level()`),
    but uses characters per word instead of syllables like :func:`coleman_liau_index()`.
    Higher value => more difficult text.

    Args:
        n_chars: Number of characters in the text.
        n_words: Number of words in the text.
        n_sents: Number of sentences in the text.

    Returns:
        The index value: a weighted sum of characters per word and words per
        sentence, minus a constant offset.

    Raises:
        ZeroDivisionError: If ``n_words`` or ``n_sents`` is zero.

    References:
        https://en.wikipedia.org/wiki/Automated_readability_index
    """
    chars_per_word_term = 4.71 * n_chars / n_words
    words_per_sent_term = 0.5 * n_words / n_sents
    return chars_per_word_term + words_per_sent_term - 21.43
def automatic_arabic_readability_index(
    n_chars: int, n_words: int, n_sents: int,
) -> float:
    """
    Readability test for Arabic-language texts based on number of characters and
    average word and sentence lengths. Higher value => more difficult text.

    Args:
        n_chars: Number of characters in the text.
        n_words: Number of words in the text.
        n_sents: Number of sentences in the text.

    Returns:
        The index value: a weighted sum of the raw character count, the average
        number of characters per word, and the average number of words per sentence.

    Raises:
        ZeroDivisionError: If ``n_words`` or ``n_sents`` is zero.

    References:
        Al Tamimi, Abdel Karim, et al. "AARI: automatic arabic readability index."
        Int. Arab J. Inf. Technol. 11.4 (2014): 370-378.
    """
    # NOTE: the first term deliberately uses the *total* character count,
    # not a per-word or per-sentence average.
    total_chars_term = 3.28 * n_chars
    chars_per_word_term = 1.43 * n_chars / n_words
    words_per_sent_term = 1.24 * n_words / n_sents
    return total_chars_term + chars_per_word_term + words_per_sent_term
def coleman_liau_index(n_chars: int, n_words: int, n_sents: int) -> float:
    """
    Readability test whose value estimates the number of years of education
    required to understand a text, similar to :func:`flesch_kincaid_grade_level()`
    and :func:`smog_index()`, but using characters per word instead of syllables.
    Higher value => more difficult text.

    Args:
        n_chars: Number of characters in the text.
        n_words: Number of words in the text.
        n_sents: Number of sentences in the text.

    Returns:
        The index value, computed from characters per word and sentences per word.

    Raises:
        ZeroDivisionError: If ``n_words`` is zero.

    References:
        https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
    """
    chars_per_word_term = 5.879851 * n_chars / n_words
    sents_per_word_term = 29.587280 * n_sents / n_words
    return chars_per_word_term - sents_per_word_term - 15.800804
def flesch_kincaid_grade_level(n_syllables: int, n_words: int, n_sents: int) -> float:
    """
    Readability test used widely in education, whose value estimates the U.S.
    grade level / number of years of education required to understand a text.
    Higher value => more difficult text.

    Args:
        n_syllables: Number of syllables in the text.
        n_words: Number of words in the text.
        n_sents: Number of sentences in the text.

    Returns:
        The grade-level estimate, computed from syllables per word and words
        per sentence.

    Raises:
        ZeroDivisionError: If ``n_words`` or ``n_sents`` is zero.

    References:
        https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch.E2.80.93Kincaid_grade_level
    """  # noqa: E501
    syllables_per_word_term = 11.8 * n_syllables / n_words
    words_per_sent_term = 0.39 * n_words / n_sents
    return syllables_per_word_term + words_per_sent_term - 15.59
def flesch_reading_ease(
    n_syllables: int, n_words: int, n_sents: int, *, lang: Optional[str] = None,
) -> float:
    """
    Readability test used as a general-purpose standard in several languages, based on
    a weighted combination of avg. sentence length and avg. word length. Values usually
    fall in the range [0, 100], but may be arbitrarily negative in extreme cases.
    Higher value => easier text.

    Args:
        n_syllables: Number of syllables in the text.
        n_words: Number of words in the text.
        n_sents: Number of sentences in the text.
        lang: Language code selecting the set of coefficients to use; must be
            one of the keys of ``_FRE_COEFS``. If None (or empty), "en" is used.

    Returns:
        The reading-ease score for the given counts and language.

    Raises:
        ValueError: If ``lang`` is not a supported language code.
        ZeroDivisionError: If ``n_words`` or ``n_sents`` is zero.

    Note:
        Coefficients in this formula are language-dependent;
        if ``lang`` is null, the English-language formulation is used.

    References:
        English: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
        German: https://de.wikipedia.org/wiki/Lesbarkeitsindex#Flesch-Reading-Ease
        Spanish: Fernández-Huerta formulation
        French: ?
        Italian: https://it.wikipedia.org/wiki/Formula_di_Flesch
        Dutch: ?
        Portuguese: https://pt.wikipedia.org/wiki/Legibilidade_de_Flesch
        Turkish: Atesman formulation
        Russian: https://ru.wikipedia.org/wiki/%D0%98%D0%BD%D0%B4%D0%B5%D0%BA%D1%81_%D1%83%D0%B4%D0%BE%D0%B1%D0%BE%D1%87%D0%B8%D1%82%D0%B0%D0%B5%D0%BC%D0%BE%D1%81%D1%82%D0%B8
    """  # noqa: E501
    try:
        coefs = _FRE_COEFS[lang or "en"]
    except KeyError:
        # The KeyError is an implementation detail of the lookup; suppress it so
        # callers only see the informative ValueError.
        raise ValueError(
            errors.value_invalid_msg("lang", lang, list(_FRE_COEFS))
        ) from None
    sentence_length_penalty = coefs["asl"] * n_words / n_sents
    word_length_penalty = coefs["awl"] * n_syllables / n_words
    return coefs["base"] - sentence_length_penalty - word_length_penalty
def gulpease_index(n_chars: int, n_words: int, n_sents: int) -> float:
    """
    Readability test for Italian-language texts, whose value is in the range
    [0, 100] similar to :func:`flesch_reading_ease()`. Higher value =>
    easier text.

    Args:
        n_chars: Number of characters in the text.
        n_words: Number of words in the text.
        n_sents: Number of sentences in the text.

    Returns:
        The index value, computed from sentences per word and characters per word.
        The result is not clamped by this function.

    Raises:
        ZeroDivisionError: If ``n_words`` is zero.

    References:
        https://it.wikipedia.org/wiki/Indice_Gulpease
    """
    sents_per_word_term = 300 * n_sents / n_words
    chars_per_word_term = 10 * n_chars / n_words
    return sents_per_word_term - chars_per_word_term + 89
def gunning_fog_index(n_words: int, n_polysyllable_words: int, n_sents: int) -> float:
    """
    Readability test whose value estimates the number of years of education
    required to understand a text, similar to :func:`flesch_kincaid_grade_level()`
    and :func:`smog_index()`. Higher value => more difficult text.

    Args:
        n_words: Number of words in the text.
        n_polysyllable_words: Number of polysyllabic words in the text.
        n_sents: Number of sentences in the text.

    Returns:
        0.4 times the sum of the average sentence length (in words) and the
        percentage of words that are polysyllabic.

    Raises:
        ZeroDivisionError: If ``n_words`` or ``n_sents`` is zero.

    References:
        https://en.wikipedia.org/wiki/Gunning_fog_index
    """
    words_per_sent = n_words / n_sents
    pct_polysyllable_words = 100 * n_polysyllable_words / n_words
    return 0.4 * (words_per_sent + pct_polysyllable_words)
def lix(n_words: int, n_long_words: int, n_sents: int) -> float:
    """
    Readability test commonly used in Sweden on both English- and non-English-language
    texts, whose value estimates the difficulty of reading a foreign text.
    Higher value => more difficult text.

    Args:
        n_words: Number of words in the text.
        n_long_words: Number of long words in the text; the length threshold
            that makes a word "long" is decided by the caller.
        n_sents: Number of sentences in the text.

    Returns:
        The sum of the average sentence length (in words) and the percentage
        of words that are long.

    Raises:
        ZeroDivisionError: If ``n_words`` or ``n_sents`` is zero.

    References:
        https://en.wikipedia.org/wiki/Lix_(readability_test)
    """
    words_per_sent = n_words / n_sents
    pct_long_words = 100 * n_long_words / n_words
    return words_per_sent + pct_long_words
def mu_legibility_index(n_chars_per_word: Collection[int]) -> float:
    """
    Readability test for Spanish-language texts based on number of words and
    the mean and variance of their lengths in characters, whose value is nominally
    in the range [0, 100] (not enforced here). Higher value => easier text.

    Args:
        n_chars_per_word: Length in characters of each word in the text.

    Returns:
        ``100 * (n / (n - 1)) * (mean / variance)``, where ``n`` is the number of
        words and mean / variance are the sample statistics of the word lengths.
        Returns 0.0 (and logs a warning) when the index is undefined: fewer than
        two words, or all words having the same length (zero variance).

    References:
        Muñoz, M., and J. Muñoz. "Legibilidad Mµ." Viña del Mar: CHL (2006).
    """
    n_words = len(n_chars_per_word)
    if n_words < 2:
        LOGGER.warning(
            "mu legibility index is undefined for texts with fewer than two words; "
            "returning 0.0"
        )
        return 0.0
    word_len_variance = statistics.variance(n_chars_per_word)
    if word_len_variance == 0:
        # All words have the same length, so mean / variance would divide by zero;
        # treat it like the other undefined case rather than raising.
        LOGGER.warning(
            "mu legibility index is undefined for texts whose words all have "
            "the same length; returning 0.0"
        )
        return 0.0
    return (
        100
        * (n_words / (n_words - 1))
        * (statistics.mean(n_chars_per_word) / word_len_variance)
    )
def perspicuity_index(n_syllables: int, n_words: int, n_sents: int) -> float:
    """
    Readability test for Spanish-language texts, whose value is in the range [0, 100];
    very similar to the Spanish-specific formulation of :func:`flesch_reading_ease()`,
    but included additionally since it's become a common readability standard.
    Higher value => easier text.

    Args:
        n_syllables: Number of syllables in the text.
        n_words: Number of words in the text.
        n_sents: Number of sentences in the text.

    Returns:
        The index value, computed from words per sentence and syllables per word.
        The result is not clamped by this function.

    Raises:
        ZeroDivisionError: If ``n_words`` or ``n_sents`` is zero.

    References:
        Pazos, Francisco Szigriszt. Sistemas predictivos de legibilidad del mensaje
        escrito: fórmula de perspicuidad. Universidad Complutense de Madrid,
        Servicio de Reprografía, 1993.
    """
    words_per_sent = n_words / n_sents
    syllables_per_word_term = 62.3 * (n_syllables / n_words)
    return 206.835 - words_per_sent - syllables_per_word_term
def smog_index(n_polysyllable_words: int, n_sents: int) -> float:
    """
    Readability test commonly used in medical writing and the healthcare industry,
    whose value estimates the number of years of education required to understand a text
    similar to :func:`flesch_kincaid_grade_level()` and intended as a substitute for
    :func:`gunning_fog_index()`. Higher value => more difficult text.

    Args:
        n_polysyllable_words: Number of polysyllabic words in the text.
        n_sents: Number of sentences in the text. A warning is logged when this
            is less than 30, since the result may then be unreliable.

    Returns:
        The index value, computed from the number of polysyllabic words
        normalized to a 30-sentence sample.

    Raises:
        ZeroDivisionError: If ``n_sents`` is zero.

    References:
        https://en.wikipedia.org/wiki/SMOG
    """
    if n_sents < 30:
        # Still compute a value for short texts; just flag it as unreliable.
        LOGGER.warning("SMOG index may be unreliable for n_sents < 30")
    polysyllables_per_30_sents = 30 * n_polysyllable_words / n_sents
    return (1.0430 * sqrt(polysyllables_per_30_sents)) + 3.1291
def wiener_sachtextformel(
    n_words: int,
    n_polysyllable_words: int,
    n_monosyllable_words: int,
    n_long_words: int,
    n_sents: int,
    *,
    variant: int = 1,
) -> float:
    """
    Readability test for German-language texts, whose value estimates the grade
    level required to understand a text. Higher value => more difficult text.

    Args:
        n_words: Number of words in the text.
        n_polysyllable_words: Number of polysyllabic words in the text.
        n_monosyllable_words: Number of monosyllabic words in the text;
            only used by variant 1.
        n_long_words: Number of long words in the text; only used by
            variants 1 and 2.
        n_sents: Number of sentences in the text.
        variant: Which of the four formulations to compute: 1, 2, 3, or 4.

    Returns:
        The value of the selected formulation.

    Raises:
        ValueError: If ``variant`` is not one of 1, 2, 3, or 4.
        ZeroDivisionError: If ``n_words`` or ``n_sents`` is zero.

    References:
        https://de.wikipedia.org/wiki/Lesbarkeitsindex#Wiener_Sachtextformel
    """
    # Reject an unsupported variant up front, before doing any arithmetic.
    if variant not in (1, 2, 3, 4):
        raise ValueError(errors.value_invalid_msg("variant", variant, [1, 2, 3, 4]))
    # ms: percentage of polysyllabic words; sl: average sentence length in words
    ms = 100 * n_polysyllable_words / n_words
    sl = n_words / n_sents
    if variant == 3:
        return (0.2963 * ms) + (0.1905 * sl) - 1.1144
    if variant == 4:
        return (0.2744 * ms) + (0.2656 * sl) - 1.693
    # iw: percentage of long words (variants 1 and 2 only)
    iw = 100 * n_long_words / n_words
    if variant == 2:
        return (0.2007 * ms) + (0.1682 * sl) + (0.1373 * iw) - 2.779
    # es: percentage of monosyllabic words (variant 1 only)
    es = 100 * n_monosyllable_words / n_words
    return (0.1935 * ms) + (0.1672 * sl) + (0.1297 * iw) - (0.0327 * es) - 0.875