# Source code for textacy.extract.kwic

"""
KWIC
----

:mod:`textacy.extract.kwic`: Extract keywords with their surrounding contexts from
a text document using regular expressions.
"""
from __future__ import annotations

import re
from typing import Iterable, Pattern, Tuple

from spacy.tokens import Doc


def keyword_in_context(
    doc: Doc | str,
    keyword: str | Pattern,
    *,
    ignore_case: bool = True,
    window_width: int = 50,
    pad_context: bool = False,
) -> Iterable[Tuple[str, str, str]]:
    """
    Search for ``keyword`` matches in ``doc`` via regular expression and yield matches
    along with ``window_width`` characters of context before and after occurrence.

    Args:
        doc: spaCy ``Doc`` or raw text in which to search for ``keyword``. If a ``Doc``,
            constituent text is grabbed via :attr:`spacy.tokens.Doc.text`. Note that
            spaCy annotations aren't used at all here, they're just a convenient
            owner of document text.
        keyword: String or regular expression pattern defining the keyword(s) to match.
            Typically, this is a single word or short phrase ("spam", "spam and eggs"),
            but to account for variations, use regex (``r"[Ss]pam (and|&) [Ee]ggs?"``),
            optionally compiled (``re.compile(r"[Ss]pam (and|&) [Ee]ggs?")``).
            Note that a string is always interpreted as a regular expression,
            so escape any special characters that should match literally.
        ignore_case: If truthy, ignore letter case in ``keyword`` matching; otherwise,
            use case-sensitive matching. Note that this argument is only used if
            ``keyword`` is a string; for pre-compiled regular expressions,
            the ``re.IGNORECASE`` flag is left as-is.
        window_width: Number of characters on either side of ``keyword``
            to include as "context". Negative values are treated as zero.
        pad_context: If truthy, pad pre- and post-context strings to ``window_width``
            chars in length; otherwise, use as many chars as are found in the text,
            up to the specified width.

    Yields:
        Next matching triple of (pre-context, keyword match, post-context).
    """
    # Raw text is used as-is; a ``Doc`` only contributes its text.
    if isinstance(doc, str):
        text = doc
    else:
        text = doc.text if isinstance(doc, Doc) else doc

    if isinstance(keyword, str):
        # Truthiness (rather than ``is True``) so that truthy non-bool flags
        # such as ``1`` are honored instead of being silently ignored.
        flags = re.IGNORECASE if ignore_case else 0
        matches = re.finditer(keyword, text, flags=flags)
    else:
        # Pre-compiled pattern: its own flags decide case sensitivity.
        matches = keyword.finditer(text)

    # A negative width could turn the post-context slice end into a negative
    # index, which slices relative to the END of the text and returns
    # unrelated content; clamp so the contexts are simply empty instead.
    width = max(window_width, 0)

    for match in matches:
        start, end = match.span()
        # Clamp the left edge at 0 so matches near the start of the text
        # don't wrap around via a negative slice index.
        pre_context = text[max(0, start - width) : start]
        post_context = text[end : end + width]
        if pad_context:
            # Right-justify the left context and left-justify the right one
            # so the keyword lines up in a column when triples are printed.
            pre_context = pre_context.rjust(width)
            post_context = post_context.ljust(width)

        yield (pre_context, match.group(), post_context)
# Source code for textacy.extract.kwic
#
# Navigation
#
# Related Topics