# Source code for textacy.augmentation.augmenter

from __future__ import annotations

import random
from typing import Callable, List, Optional, Sequence, Tuple

from spacy.tokens import Doc

from .. import spacier, types, utils
from . import utils as aug_utils


# Type alias for a single augmentation transform: a callable that receives
# a list of ``AugTok`` items and returns a (possibly modified) list of ``AugTok`` items.
AugTransform = Callable[[List[aug_utils.AugTok]], List[aug_utils.AugTok]]


class Augmenter:
    """
    Randomly apply one or many data augmentation transforms to spaCy ``Doc`` s
    to produce new docs with additional variety and/or noise in the data.

    Initialize an ``Augmenter`` with multiple transforms, and customize the randomization
    of their selection when applying to a document::

        >>> tfs = [transforms.delete_words, transforms.swap_chars, transforms.delete_chars]
        >>> Augmenter(tfs, num=None)  # all tfs applied each time
        >>> Augmenter(tfs, num=1)  # one randomly-selected tf applied each time
        >>> Augmenter(tfs, num=0.5)  # tfs randomly selected with 50% prob each time
        >>> augmenter = Augmenter(tfs, num=[0.4, 0.8, 0.6])  # tfs randomly selected with 40%, 80%, 60% probs, respectively, each time

    Apply transforms to a given ``Doc`` to produce new documents::

        >>> text = "The quick brown fox jumps over the lazy dog."
        >>> doc = textacy.make_spacy_doc(text, lang="en_core_web_sm")
        >>> augmenter.apply_transforms(doc, lang="en_core_web_sm")
        The quick brown ox jupms over the lazy dog.
        >>> augmenter.apply_transforms(doc, lang="en_core_web_sm")
        The quikc brown fox over the lazy dog.
        >>> augmenter.apply_transforms(doc, lang="en_core_web_sm")
        quick brown fox jumps over teh lazy dog.

    Parameters for individual transforms may be specified when initializing ``Augmenter``
    or, if necessary, when applying to individual documents::

        >>> from functools import partial
        >>> tfs = [partial(transforms.delete_words, num=3), transforms.swap_chars]
        >>> augmenter = Augmenter(tfs)
        >>> augmenter.apply_transforms(doc, lang="en_core_web_sm")
        brown fox jumps over layz dog.
        >>> augmenter.apply_transforms(doc, lang="en_core_web_sm", pos={"NOUN", "ADJ"})
        The jumps over the lazy odg.

    Args:
        transforms: Ordered sequence of callables that must take List[:obj:`AugTok`]
            as their first positional argument and return another List[:obj:`AugTok`].

            .. note:: Although the particular transforms applied may vary doc-by-doc,
               they are applied *in order* as listed here. Since some transforms may
               clobber text in a way that makes other transforms less effective,
               a stable ordering can improve the quality of augmented data.

        num: If int, number of transforms to randomly select from ``transforms`` each time
            :meth:`Augmenter.apply_transforms()` is called; must be between 0 and
            ``len(transforms)``, inclusive.
            If float, probability that any given transform will be selected.
            If Sequence[float], the probability that the corresponding transform
            in ``transforms`` will be selected (these must be the same length).
            If None (default), num is set to ``len(transforms)``, which means that
            every transform is applied each time.

    Raises:
        ValueError: If ``transforms`` is empty or ``num`` is not one of the
            accepted forms described above.
        TypeError: If any item in ``transforms`` is not callable.

    See Also:
        A collection of general-purpose transforms are implemented in
        :mod:`textacy.augmentation.transforms`.
    """

    def __init__(
        self,
        transforms: Sequence[AugTransform],
        *,
        num: Optional[int | float | Sequence[float]] = None,
    ):
        # transforms must be validated first: validating ``num`` reads ``self.tfs``
        self.tfs = self._validate_transforms(transforms)
        self.num = self._validate_num(num)

    def apply_transforms(self, doc: Doc, lang: types.LangLike, **kwargs) -> Doc:
        """
        Sequentially apply some subset of data augmentation transforms to ``doc``,
        then return a new ``Doc`` created from the augmented text using ``lang``.

        Args:
            doc: Document whose text is to be augmented. If it has sentence
                boundary annotations, transforms are applied sentence-by-sentence;
                otherwise, they are applied to the document as a whole.
            lang: Language pipeline specifier used to make the new ``Doc``
                from the augmented text.
            **kwargs: If, for whatever reason, you have to pass keyword argument values
                into transforms that vary or depend on characteristics of ``doc``,
                specify them here. The transforms' call signatures will be inspected,
                and values will be passed along, as needed.

        Returns:
            :class:`spacy.tokens.Doc`
        """
        if doc.has_annotation("SENT_START"):
            nested_aug_toks = [aug_utils.to_aug_toks(sent) for sent in doc.sents]
        else:
            nested_aug_toks = [aug_utils.to_aug_toks(doc)]
        # transforms are drawn once per call, so every sentence in ``doc``
        # is augmented by the same randomly-selected subset
        tfs = self._get_random_transforms()
        # resolve the kwargs applicable to each transform once, up front,
        # rather than re-inspecting call signatures for every sentence
        tfs_and_kwargs = [
            (tf, utils.get_kwargs_for_func(tf, kwargs) if kwargs else {})
            for tf in tfs
        ]
        new_nested_aug_toks = []
        for aug_toks in nested_aug_toks:
            for tf, tf_kwargs in tfs_and_kwargs:
                aug_toks = tf(aug_toks, **tf_kwargs)
            new_nested_aug_toks.append(aug_toks)
        return self._make_new_spacy_doc(new_nested_aug_toks, lang)

    def _validate_transforms(
        self, transforms: Sequence[AugTransform]
    ) -> Tuple[AugTransform, ...]:
        """
        Check that ``transforms`` is a non-empty collection of callables,
        and return it as a tuple.

        Raises:
            ValueError: If no transforms are given.
            TypeError: If any transform is not callable.
        """
        transforms = tuple(transforms)
        if not transforms:
            raise ValueError("at least one transform callable must be specified")
        if not all(callable(transform) for transform in transforms):
            raise TypeError("all transforms must be callable")
        return transforms

    def _validate_num(
        self, num: Optional[int | float | Sequence[float]]
    ) -> int | float | Tuple[float, ...]:
        """
        Check that ``num`` is one of the accepted forms, and return its normalized value:
        None becomes the number of transforms, and a list/tuple of floats becomes a tuple.

        Raises:
            ValueError: If ``num`` is of an unsupported type or out of range.
        """
        n_tfs = len(self.tfs)
        if num is None:
            return n_tfs
        if isinstance(num, int) and 0 <= num <= n_tfs:
            return num
        if isinstance(num, float) and 0.0 <= num <= 1.0:
            return num
        if (
            isinstance(num, (tuple, list))
            and len(num) == n_tfs
            and all(isinstance(n, float) and 0.0 <= n <= 1.0 for n in num)
        ):
            return tuple(num)
        # the message spells out the ranges actually enforced above
        raise ValueError(
            f"num={num} is invalid; must be an int in [0, {n_tfs}], "
            "a float in [0.0, 1.0], or a list of floats in [0.0, 1.0] "
            "of length equal to given transforms"
        )

    def _get_random_transforms(self) -> List[AugTransform]:
        """
        Randomly select transforms according to ``self.num``, preserving
        the order in which they were given at initialization.
        """
        num = self.num
        if isinstance(num, int):
            # sample positions rather than transforms, so that the original
            # ordering can be restored by sorting
            rand_idxs = random.sample(range(len(self.tfs)), min(num, len(self.tfs)))
            return [self.tfs[idx] for idx in sorted(rand_idxs)]
        if isinstance(num, float):
            # the same selection probability applies to every transform
            return [tf for tf in self.tfs if random.random() < num]
        # one selection probability per transform
        return [tf for tf, tf_num in zip(self.tfs, num) if random.random() < tf_num]

    def _make_new_spacy_doc(self, nested_aug_tokens, lang: types.LangLike) -> Doc:
        """
        Join the text and trailing whitespace of all augmented tokens back into
        a single string, then make a new ``Doc`` from that string using ``lang``.
        """
        # TODO: maybe collect words, spaces, and array vals
        # then directly instantiate a new Doc object?
        # this would require adding an array field to AugTok
        new_text = "".join(
            aug_tok.text + aug_tok.ws
            for aug_toks in nested_aug_tokens
            for aug_tok in aug_toks
        )
        return spacier.core.make_spacy_doc(new_text, lang=lang)
# Source code for textacy.augmentation.augmenter
#
# Navigation
#
# Related Topics