Source code for textacy.preprocessing.normalize
"""
Normalize
---------
:mod:`textacy.preprocessing.normalize`: Normalize aspects of raw text that may vary
in problematic ways.
"""
import re
import unicodedata
from . import resources
def bullet_points(text: str) -> str:
    """
    Replace "fancy" bullet point symbols in ``text`` with a plain ASCII "-",
    provided they are the first non-whitespace characters on a new line
    (as in a list of items).

    Args:
        text: Text to normalize.

    Returns:
        str: ``text`` in which every match of ``resources.RE_BULLET_POINTS`` has
        been rewritten as the pattern's first captured group followed by "-".
    """
    pattern = resources.RE_BULLET_POINTS
    # "\1" re-emits whatever the pattern captured in its first group, so only
    # the remainder of the match is swapped for the ASCII dash.
    normalized = pattern.sub(r"\1-", text)
    return normalized
def hyphenated_words(text: str) -> str:
    """
    Rejoin words in ``text`` that were split across lines with a hyphen
    (i.e. hyphenated for visual consistency), dropping the hyphen and the
    whitespace between the two pieces.

    Args:
        text: Text to normalize.

    Returns:
        str: ``text`` in which every match of ``resources.RE_HYPHENATED_WORD`` has
        been replaced by its first two captured groups, concatenated.
    """
    pattern = resources.RE_HYPHENATED_WORD
    # Keep only the two captured groups; everything else in the match is discarded.
    rejoined = pattern.sub(r"\1\2", text)
    return rejoined
def quotation_marks(text: str) -> str:
    """
    Convert all "fancy" single- and double-quotation marks in ``text`` to their
    basic ASCII equivalents. Fancy apostrophes are normalized as well, since they
    are typically represented as single quotation marks.

    Args:
        text: Text to normalize.

    Returns:
        str: ``text`` translated through ``resources.QUOTE_TRANSLATION_TABLE``.
    """
    table = resources.QUOTE_TRANSLATION_TABLE
    translated = text.translate(table)
    return translated
def repeating_chars(text: str, *, chars: str, maxn: int = 1) -> str:
    """
    Normalize repeating characters in ``text`` by truncating their number of
    consecutive repetitions to ``maxn``.

    Args:
        text: Text to normalize.
        chars: One or more characters whose consecutive repetitions are to be
            normalized, e.g. "." or "?!". A multi-character value is matched as a
            single unit, so "?!" collapses "?!?!?!" rather than separate runs of
            "?" or "!".
        maxn: Maximum number of consecutive repetitions of ``chars`` to which
            longer repetitions will be truncated.

    Returns:
        str: ``text`` with every run of more than ``maxn`` consecutive ``chars``
        shortened to exactly ``maxn`` repetitions.
    """
    if not chars:
        # An empty unit can only ever match empty strings; nothing to collapse.
        return text
    # ``chars`` is escaped so regex metacharacters such as "." or "?" are literal.
    pattern = r"(?:{}){{{},}}".format(re.escape(chars), maxn + 1)
    replacement = chars * maxn
    # Use a callable so the replacement is inserted verbatim: a plain string is
    # parsed as a replacement template, in which a backslash in ``chars`` would be
    # treated as an escape sequence and raise ``re.error``.
    return re.sub(pattern, lambda _match: replacement, text)
def unicode(text: str, *, form: str = "NFC") -> str:
    """
    Normalize unicode characters in ``text`` into canonical forms.

    Args:
        text: Text to normalize.
        form ({"NFC", "NFD", "NFKC", "NFKD"}): Form of normalization applied to
            unicode characters. For example, an "e" with acute accent "´" can be
            written as "e´" (canonical decomposition, "NFD") or "é" (canonical
            composition, "NFC"). Unicode can be normalized to NFC form
            without any change in meaning, so it's usually a safe bet. If "NFKC",
            additional normalizations are applied that can change characters'
            meanings, e.g. ellipsis characters are replaced with three periods.

    Returns:
        str: The result of ``unicodedata.normalize(form, text)``.

    See Also:
        https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
    """
    normalized = unicodedata.normalize(form, text)
    return normalized
def whitespace(text: str) -> str:
    """
    Replace all contiguous zero-width spaces with an empty string, line-breaking
    spaces with a single newline, and non-breaking spaces with a single space,
    then strip any leading/trailing whitespace.

    Args:
        text: Text to normalize.

    Returns:
        str: ``text`` after the three substitutions and a final ``strip()``.
    """
    # (pattern, replacement) pairs, applied one after another in this order.
    substitutions = (
        (resources.RE_ZWSP, ""),
        (resources.RE_LINEBREAK, r"\n"),
        (resources.RE_NONBREAKING_SPACE, " "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()