# Source code for textacy.extract.kwic

"""
KWIC
----

:mod:`textacy.extract.kwic`: Extract keywords with their surrounding contexts from
a text document using regular expressions.
"""
from __future__ import annotations

import re
from typing import Iterable, Pattern, Tuple

from spacy.tokens import Doc


def keyword_in_context(
    doc: Doc | str,
    keyword: str | Pattern,
    *,
    ignore_case: bool = True,
    window_width: int = 50,
    pad_context: bool = False,
) -> Iterable[Tuple[str, str, str]]:
    """
    Search for ``keyword`` matches in ``doc`` via regular expression and yield matches
    along with ``window_width`` characters of context before and after occurrence.

    Args:
        doc: spaCy ``Doc`` or raw text in which to search for ``keyword``. If a ``Doc``,
            constituent text is grabbed via :attr:`spacy.tokens.Doc.text`. Note that
            spaCy annotations aren't used at all here, they're just a convenient owner
            of document text.
        keyword: String or regular expression pattern defining the keyword(s) to match.
            Typically, this is a single word or short phrase ("spam", "spam and eggs"),
            but to account for variations, use regex (``r"[Ss]pam (and|&) [Ee]ggs?"``),
            optionally compiled (``re.compile(r"[Ss]pam (and|&) [Ee]ggs?")``).
        ignore_case: If truthy, ignore letter case in ``keyword`` matching; otherwise,
            use case-sensitive matching. Note that this argument is only used if
            ``keyword`` is a string; for pre-compiled regular expressions,
            the ``re.IGNORECASE`` flag is left as-is.
        window_width: Number of characters on either side of ``keyword``
            to include as "context". Negative values are treated as 0, i.e. both
            contexts come back empty.
        pad_context: If truthy, pad pre- and post-context strings to ``window_width``
            chars in length; otherwise, use as many chars as are found in the text,
            up to the specified width.

    Yields:
        Next matching triple of (pre-context, keyword match, post-context).
    """
    if isinstance(doc, str):
        # Raw text is searched directly.
        text = doc
    else:
        # A Doc contributes its text; any other object is handed to the regex
        # call unchanged.
        text = doc.text if isinstance(doc, Doc) else doc

    # Clamp the width at zero: a negative value would otherwise produce a negative
    # stop index in the post-context slice, which wraps around and returns a long
    # stretch of text instead of an empty string.
    width = max(0, window_width)

    if isinstance(keyword, str):
        flags = re.IGNORECASE if ignore_case else 0
        matches = re.finditer(keyword, text, flags=flags)
    else:
        # Pre-compiled pattern: its own flags apply, ``ignore_case`` is not consulted.
        matches = keyword.finditer(text)

    for match in matches:
        start, end = match.span()
        # ``max(0, ...)`` keeps the start index from going negative near the
        # beginning of the text; slicing past the end is already safe.
        pre_context = text[max(0, start - width) : start]
        post_context = text[end : end + width]
        if pad_context:
            # Right-align the left context and left-align the right context so the
            # keyword sits in a fixed column.
            pre_context = pre_context.rjust(width)
            post_context = post_context.ljust(width)
        yield (pre_context, match.group(), post_context)