# Source code for textacy.preprocessing.normalize

"""
Normalize
---------

:mod:`textacy.preprocessing.normalize`: Normalize aspects of raw text that may vary
in problematic ways.
"""
import re
import unicodedata

from . import resources


def bullet_points(text: str) -> str:
    """
    Normalize all "fancy" bullet point symbols in ``text`` to just the basic ASCII "-",
    provided they are the first non-whitespace characters on a new line
    (like a list of items).

    Args:
        text: Text in which bullet point symbols are to be normalized.

    Returns:
        Copy of ``text`` in which every match of ``resources.RE_BULLET_POINTS``
        has been replaced by the match's first captured group followed by "-".
    """
    # ``\1`` writes the pattern's first captured group back unchanged, so only the
    # remainder of the match is swapped for "-". The pattern itself is defined in
    # ``resources``; presumably group 1 is the line start / leading whitespace.
    return resources.RE_BULLET_POINTS.sub(r"\1-", text)


def hyphenated_words(text: str) -> str:
    """
    Normalize words in ``text`` that have been split across lines by a hyphen
    for visual consistency (aka hyphenated) by joining the pieces back together,
    sans hyphen and whitespace.

    Args:
        text: Text in which line-split, hyphenated words are to be re-joined.

    Returns:
        Copy of ``text`` in which every match of ``resources.RE_HYPHENATED_WORD``
        has been replaced by its first and second captured groups, concatenated.
    """
    # Only the two captured groups are written back; whatever the pattern matched
    # between them is dropped. The pattern is defined in ``resources``; presumably
    # the groups are the two word pieces and the gap is the hyphen + line break.
    return resources.RE_HYPHENATED_WORD.sub(r"\1\2", text)


def quotation_marks(text: str) -> str:
    """
    Normalize all "fancy" single- and double-quotation marks in ``text``
    to just the basic ASCII equivalents. Note that this will also normalize fancy
    apostrophes, which are typically represented as single quotation marks.

    Args:
        text: Text in which quotation marks are to be normalized.

    Returns:
        Copy of ``text`` with characters mapped through
        ``resources.QUOTE_TRANSLATION_TABLE``; characters that are not in the table
        are left as they are.
    """
    # A single ``str.translate`` pass handles every mapped character at once.
    return text.translate(resources.QUOTE_TRANSLATION_TABLE)


def repeating_chars(text: str, *, chars: str, maxn: int = 1) -> str:
    """
    Normalize repeating characters in ``text`` by truncating their number of consecutive
    repetitions to ``maxn``.

    ``chars`` is matched literally and as a unit: for ``chars="?!"`` it is runs of
    the sequence "?!" (e.g. "?!?!?!") that get truncated, not runs of "?" or "!"
    individually.

    Args:
        text: Text in which repetitions are to be truncated.
        chars: One or more characters whose consecutive repetitions are to be normalized,
            e.g. "." or "?!".
        maxn: Maximum number of consecutive repetitions of ``chars`` to which
            longer repetitions will be truncated.

    Returns:
        str: Copy of ``text`` in which every run of more than ``maxn`` consecutive
        repetitions of ``chars`` has been replaced by exactly ``maxn`` repetitions.
    """
    # ``re.escape`` makes regex metacharacters in ``chars`` (".", "?", "*", ...) match
    # literally; the quantifier ``{maxn + 1,}`` only fires on runs longer than ``maxn``.
    pattern = r"(?:{}){{{},}}".format(re.escape(chars), maxn + 1)
    replacement = chars * maxn
    # Use a callable rather than a replacement string: ``re.sub`` processes backslash
    # escapes in string replacements, so a ``chars`` containing "\" would either
    # raise ``re.error`` or come out mangled. A callable's return value is inserted
    # as-is.
    return re.sub(pattern, lambda _match: replacement, text)


def unicode(text: str, *, form: str = "NFC") -> str:
    """
    Normalize unicode characters in ``text`` into canonical forms.

    Args:
        text: Text whose unicode characters are to be normalized.
        form ({"NFC", "NFD", "NFKC", "NFKD"}): Form of normalization applied to
            unicode characters. For example, an "e" with acute accent "´" can be
            written as "e´" (canonical decomposition, "NFD") or "é" (canonical
            composition, "NFC"). Unicode can be normalized to NFC form
            without any change in meaning, so it's usually a safe bet. If "NFKC",
            additional normalizations are applied that can change characters' meanings,
            e.g. ellipsis characters are replaced with three periods.

    Returns:
        str: ``text`` normalized to the requested ``form``.

    Raises:
        ValueError: If ``form`` is not a normalization form recognized by
            :func:`unicodedata.normalize`.

    See Also:
        https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
    """
    # Note the argument order: the stdlib function takes the form first, then the text.
    return unicodedata.normalize(form, text)


def whitespace(text: str) -> str:
    """
    Replace all contiguous zero-width spaces with an empty string, line-breaking spaces
    with a single newline, and non-breaking spaces with a single space, then
    strip any leading/trailing whitespace.

    Args:
        text: Text whose whitespace is to be normalized.

    Returns:
        Copy of ``text`` after the three substitutions below have been applied
        in order and leading/trailing whitespace has been stripped.
    """
    # The substitutions run sequentially, each on the output of the previous one;
    # the patterns themselves are defined in ``resources``.
    text = resources.RE_ZWSP.sub("", text)
    # In a replacement string, ``\n`` is interpreted by ``re`` as a newline character.
    text = resources.RE_LINEBREAK.sub(r"\n", text)
    text = resources.RE_NONBREAKING_SPACE.sub(" ", text)
    return text.strip()
# Source code for textacy.preprocessing.normalize
#
# Navigation
#
# Related Topics